# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2020 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later

import json
import logging
import os
import re
import unicodedata
from urllib.error import HTTPError
import urllib.request

from django.core.exceptions import ValidationError
from django.db.models import QuerySet
from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _

from .models import Auteur, BD
from .scraper import BedetequeScraper

logger = logging.getLogger(__name__)

# Leading articles removed from an upper-cased title. They are tried once
# each, in this order, so "DE LA X" loses both words but "LA LE X" keeps "LE".
_TITLE_PREFIXES = (
    'DE ', 'DES ', 'LE ', 'LA ', 'LES ', 'L\'', 'UN ', 'UNE ', 'THE ',
)

# Unicode major categories dropped while normalising identifiers:
# marks, punctuation, separators and control/other characters.
_DROPPED_CATEGORIES = ('M', 'P', 'Z', 'C')

# Month number keyed by the first three letters of the month name.
# "Mai" is kept because the original fallback list accepted that spelling.
_MONTH_NUMBERS = dict(
    Jan="01", Feb="02", Mar="03", Apr="04", May="05", Jun="06",
    Jul="07", Aug="08", Sep="09", Oct="10", Nov="11", Dec="12",
    Mai="05",
)


def _strip_accents(text):
    """
    Return ``text`` upper-cased with accents and punctuation removed.

    The text is NFKD-decomposed, then every character whose Unicode
    category starts with M, P, Z or C is dropped, except the plain space.
    """
    decomposed = unicodedata.normalize('NFKD', text.casefold())
    kept = ''.join(
        char for char in decomposed
        if char == ' '
        or not unicodedata.category(char).startswith(_DROPPED_CATEGORIES)
    )
    return kept.casefold().upper()


def _openlibrary_date_to_iso(raw_date):
    """
    Convert an Open Library ``publish_date`` string to ``YYYY-MM-DD``.

    Handled shapes are a single token (returned with ``-01-01`` appended),
    ``Month YYYY`` and ``Month D, YYYY``. Returns None when the string is
    empty or its first token is not a known month name.
    """
    parts = raw_date.replace(',', '').split()
    if not parts:
        return None
    if len(parts) == 1:
        return parts[0] + "-01-01"
    month = _MONTH_NUMBERS.get(parts[0][:3])
    if month is None:
        return None
    if len(parts) == 2:
        # "Month YYYY": no day given, default to the first of the month.
        return "{}-{}-01".format(parts[1], month)
    return "{}-{}-{}".format(parts[2], month, parts[1].zfill(2))


def generate_side_identifier(title, authors, subtitle=None):
    """
    Build the side identifier of a medium from its title and authors.

    The result is "<AUT> <TIT>" (up to three letters of the primary
    author's name, then up to three characters of the title), followed by
    " <NN>" when the subtitle starts with a number.

    :param title: title of the medium
    :param authors: QuerySet or iterable of authors exposing ``name``,
        ``note`` and ``pk``; may be empty, in which case the author part
        of the identifier is empty
    :param subtitle: optional subtitle, used only for its leading number
    :return: the upper-cased, accent-free identifier
    """
    if isinstance(authors, QuerySet):
        authors = list(authors)

    title_normalized = title.upper().replace('’', '\'')
    for prefix in _TITLE_PREFIXES:
        if title_normalized.startswith(prefix):
            title_normalized = title_normalized[len(prefix):]
    title_normalized = title_normalized.replace('Œ', 'OE').replace(' ', '')
    title_normalized = _strip_accents(title_normalized)
    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)

    def sort_key(author):
        # Text key: zero-padded negated note, last word of the name, then
        # the zero-padded primary key as a tie-breaker.
        # NOTE(review): keys compare as text, not as numbers - confirm the
        # resulting order of notes is the intended one.
        return "{:042d}".format(-author.note) \
            + author.name.split(" ")[-1] \
            + ".{:042d}".format(author.pk)

    # sorted() leaves the caller's list untouched.
    ordered_authors = sorted(authors, key=sort_key)

    # Without any author the author part stays empty instead of raising
    # IndexError, e.g. for a stored medium that has no author yet.
    author_name = ordered_authors[0].name.upper() if ordered_authors else ''
    if ',' not in author_name and ' ' in author_name:
        # "First Last" form: keep the last word only.
        author_name = author_name.split(' ')[-1]
    author_name = re.sub("[^A-Z]", "", _strip_accents(author_name))

    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)

    if subtitle:
        # NOTE(review): both patterns below are empty, so these two
        # substitutions leave the subtitle unchanged; presumably the real
        # patterns were lost - confirm against the project history.
        subtitle = re.sub(r'', '', subtitle)
        subtitle = re.sub(r'', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')
        start = re.sub("^R?", "", start)
        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize side identifier, in order to remove accents
    return _strip_accents(side_identifier)


class MediaAdminForm(ModelForm):
    """
    Admin form for BD media that can pre-fill its fields from an ISBN.

    When the form is submitted with ``_isbn`` or ``_isbn_addanother``, the
    ISBN is looked up on ISBNdb, Bedeteque, Google Books and Open Library,
    in that order, and the first successful answer fills ``cleaned_data``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        isbn_field = self.fields.get('isbn')
        if isbn_field:
            isbn_field.widget.template_name = "media/isbn_button.html"
            isbn_field.widget.attrs.update({'autofocus': 'autofocus'})

        side_identifier_field = self.fields.get('side_identifier')
        if side_identifier_field and self.instance and self.instance.pk:
            # Expose the generated identifier of the stored instance to
            # the widget template.
            instance = self.instance
            generated = generate_side_identifier(
                instance.title,
                instance.authors.all(),
                getattr(instance, "subtitle", None),
            )
            side_identifier_field.widget.attrs.update(
                {'data-generated-side-identifier': generated})
            side_identifier_field.widget.template_name = \
                "media/generate_side_identifier.html"

    def download_data_isbndb(self, isbn):
        """
        Download data from ISBNdb
        :return True if success
        """
        api_key = os.getenv("ISBNDB_KEY", "")
        # NOTE(review): the key is sent both in the query string and in
        # the Authorization header; query strings tend to end up in logs -
        # confirm whether the query parameter is still needed.
        api_url = "https://api2.isbndb.com/book/" + str(isbn) \
            + "?Authorization=" + api_key
        req = urllib.request.Request(api_url)
        req.add_header("Authorization", api_key)
        try:
            with urllib.request.urlopen(req) as url:
                payload = json.loads(url.read().decode())
        except HTTPError:
            return False

        data = payload.get("book") if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            # No usable "book" entry: let the next provider try.
            return False
        logger.debug("ISBNdb data for %s: %s", isbn, data)

        data.setdefault("title", "")
        data.setdefault("date_published", "1970-01-01")
        data.setdefault("pages", 0)
        data.setdefault("authors", [])
        data.setdefault("image", "")

        # Pad "YYYY" and "YYYY-MM" up to a full "YYYY-MM-DD" date.
        publish_date = data["date_published"][:10]
        while len(publish_date) in (4, 7):
            publish_date += "-01"

        self.cleaned_data["title"] = data["title"]
        self.cleaned_data["publish_date"] = publish_date
        self.cleaned_data["number_of_pages"] = data["pages"]
        self.cleaned_data["authors"] = [
            Auteur.objects.get_or_create(name=author_name)[0]
            for author_name in data["authors"]
        ]
        self.cleaned_data["external_url"] = data["image"]
        return True

    def download_data_bedeteque(self, isbn):
        """
        Download data from bedeteque
        :return True if success
        """
        scraper = BedetequeScraper()
        results = scraper.search_by_isbn(isbn)
        if not results:
            return False
        # If results, then take the most accurate
        self.cleaned_data.update(scraper.scrap_bd_info(results[0]))
        return True

    def download_data_google(self, isbn):
        """
        Download data from google books
        :return True if success
        """
        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
            .format(isbn)
        try:
            with urllib.request.urlopen(api_url) as url:
                data = json.loads(url.read().decode())
        except HTTPError:
            # Same policy as ISBNdb: a failing provider must not prevent
            # the next one from being tried.
            return False

        if not data or not data.get('totalItems'):
            return False

        for item in data.get('items', []):
            # Some volumes carry no identifier list at all.
            identifiers = item.get("volumeInfo", {}) \
                .get("industryIdentifiers", [])
            if any(entry.get("identifier") == isbn
                   for entry in identifiers):
                # Fill the data
                self.parse_data_google(item)
                return True
        return False

    def parse_data_google(self, data):
        """
        Copy the fields of a Google Books volume into ``cleaned_data``.
        :param data: one entry of the ``items`` list of the API answer
        """
        info = data['volumeInfo']
        self.cleaned_data['external_url'] = info['canonicalVolumeLink']
        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        # The subtitle lives in volumeInfo, next to the title.
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']

        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        if 'publishedDate' in info:
            # Pad "YYYY" and "YYYY-MM" up to a full date.
            published = info["publishedDate"]
            if "-" not in published:
                published += "-01-01"
            elif len(published) == 7:
                published += "-01"
            self.cleaned_data['publish_date'] = published[:10]

        if not self.cleaned_data.get('authors'):
            self.cleaned_data['authors'] = []
        for author in info.get('authors', []):
            author_obj = Auteur.objects.get_or_create(name=author)[0]
            self.cleaned_data['authors'].append(author_obj)

        logger.debug("Google Books data: %s", self.cleaned_data)

    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary
        :return True if success
        """
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data".format(isbn)
        try:
            with urllib.request.urlopen(api_url) as url:
                data = json.loads(url.read().decode())
        except HTTPError:
            return False

        book = data.get('ISBN:{}'.format(isbn)) if data else None
        if book and 'url' in book:
            # Fill the data
            self.parse_data_openlibrary(book)
            return True
        return False

    def parse_data_openlibrary(self, data):
        """
        Copy the fields of an Open Library record into ``cleaned_data``.
        :param data: the record stored under the ``ISBN:<isbn>`` key
        """
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']

        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data.get('number_of_pages'):
            self.cleaned_data['number_of_pages'] = 0

        if 'publish_date' in data:
            publish_date = _openlibrary_date_to_iso(data['publish_date'])
            # An unrecognised date is skipped rather than raising.
            if publish_date is not None:
                self.cleaned_data['publish_date'] = publish_date

        if not self.cleaned_data.get('authors'):
            self.cleaned_data['authors'] = []
        for author in data.get('authors', []):
            author_obj = Auteur.objects.get_or_create(
                name=author['name'])[0]
            self.cleaned_data['authors'].append(author_obj)

    def clean(self):
        """
        If user fetch ISBN data, then download data
        before validating the form
        """
        super().clean()

        if "_isbn" not in self.data \
                and "_isbn_addanother" not in self.data:
            return self.cleaned_data

        isbn = self.cleaned_data.get('isbn')
        if "_isbn_addanother" in self.data:
            # Rewrite the submitted data so that it carries an
            # "_addanother" entry, and hand it back to the request.
            self.data = self.data.copy()
            self.data['_addanother'] = 42
            self.request.POST = self.data

        if isbn:
            # Providers are tried in order; "or" stops at the first hit.
            found = (
                self.download_data_isbndb(isbn)
                or self.download_data_bedeteque(isbn)
                or self.download_data_google(isbn)
                or self.download_data_openlibrary(isbn)
            )
            if not found:
                self.add_error('isbn', _("This ISBN is not found."))
                return self.cleaned_data

        title = self.cleaned_data.get('title')
        if title:
            self.cleaned_data['title'] = re.sub(r'\(AUT\) ', '', title)

        if self.cleaned_data.get('authors'):
            self.cleaned_data['side_identifier'] = generate_side_identifier(
                self.cleaned_data["title"],
                self.cleaned_data["authors"],
                self.cleaned_data.get("subtitle"),
            )

        return self.cleaned_data

    def _clean_fields(self):
        """
        Clean every field, skipping validation once an ISBN is known.

        NOTE(review): the nesting of the ``try`` body was rebuilt from a
        whitespace-collapsed file, following the layout of Django's own
        ``_clean_fields`` - confirm against the project history.
        """
        for name, field in self.fields.items():
            # value_from_datadict() gets the data from the data dictionaries.
            # Each widget type knows how to retrieve its own data, because some
            # widgets split data over several HTML fields.
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                if "isbn" not in self.data \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value
                if hasattr(self, 'clean_%s' % name):
                    value = getattr(self, 'clean_%s' % name)()
                    self.cleaned_data[name] = value
            except ValidationError as e:
                self.add_error(name, e)

    class Meta:
        model = BD
        fields = ('isbn', 'title', 'subtitle', 'external_url',
                  'side_identifier', 'authors', 'number_of_pages',
                  'publish_date', 'present', )