2019-08-02 12:57:53 +00:00
|
|
|
|
# -*- mode: python; coding: utf-8 -*-
|
2020-09-28 08:51:21 +00:00
|
|
|
|
# Copyright (C) 2017-2020 by BDE ENS Paris-Saclay
|
2019-08-02 12:57:53 +00:00
|
|
|
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
|
|
2019-08-11 08:40:39 +00:00
|
|
|
|
import json
|
2020-10-03 13:36:28 +00:00
|
|
|
|
import os
|
2020-02-10 16:51:09 +00:00
|
|
|
|
import re
|
2020-02-10 15:47:05 +00:00
|
|
|
|
import unicodedata
|
2020-10-03 13:51:25 +00:00
|
|
|
|
from urllib.error import HTTPError
|
2020-02-10 20:12:12 +00:00
|
|
|
|
import urllib.request
|
2019-08-11 08:40:39 +00:00
|
|
|
|
|
2020-09-28 08:51:21 +00:00
|
|
|
|
from django.db.models import QuerySet
|
2019-08-02 12:57:53 +00:00
|
|
|
|
from django.forms import ModelForm
|
2020-05-12 15:08:40 +00:00
|
|
|
|
from django.utils.translation import gettext_lazy as _
|
2019-08-02 12:57:53 +00:00
|
|
|
|
|
2020-05-22 16:04:41 +00:00
|
|
|
|
from .models import Auteur, BD
|
2019-08-19 10:17:49 +00:00
|
|
|
|
from .scraper import BedetequeScraper
|
|
|
|
|
|
2019-08-11 08:40:39 +00:00
|
|
|
|
|
2020-09-28 08:51:21 +00:00
|
|
|
|
def _strip_marks(text):
    """Decompose *text* (NFKD), drop marks/punctuation/separators/control
    characters (keeping plain spaces) and return it upper-cased."""
    return ''.join(
        char for char in unicodedata.normalize('NFKD', text.casefold())
        if all(not unicodedata.category(char).startswith(cat)
               for cat in ('M', 'P', 'Z', 'C'))
        or char == ' '
    ).casefold().upper()


def generate_side_identifier(title, authors, subtitle=None):
    """
    Build the short "side identifier" of a work: "AUT TIT", i.e. the first
    three letters of the primary author's last name and of the normalized
    title, optionally followed by a two-digit volume number taken from the
    subtitle.

    :param title: title of the work.
    :param authors: iterable of author objects exposing ``name``, ``note``
        and ``pk`` (a Django QuerySet, a list, ...).
    :param subtitle: optional subtitle; a leading number in it is appended
        as a zero-padded volume number.
    :return: the accent-free, upper-case side identifier string.
    """
    # Materialize a private list so sorting never mutates the caller's data
    # (also accepts QuerySets and any other iterable).
    authors = list(authors)

    title_normalized = title.upper()
    title_normalized = title_normalized.replace('’', '\'')
    # Strip leading French/English articles: they do not classify the work.
    # Applied sequentially, in the same order as before.
    for article in (r'^DE ', r'^DES ', r'^LE ', r'^LA ', r'^LES ', r'^L\'',
                    r'^UN ', r'^UNE ', r'^THE '):
        title_normalized = re.sub(article, '', title_normalized)
    title_normalized = re.sub(r'Œ', 'OE', title_normalized)
    title_normalized = title_normalized.replace(' ', '')
    title_normalized = _strip_marks(title_normalized)
    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)

    # Primary author: highest note first, then last name, then pk as a
    # deterministic tie-breaker. (The previous string key zero-padded
    # negative numbers, which compares lexicographically in ascending note
    # order — the opposite of what `-note` intends.)
    authors.sort(key=lambda author: (-author.note,
                                     author.name.split(" ")[-1],
                                     author.pk))
    primary_author = authors[0]

    author_name = primary_author.name.upper()
    if ',' not in author_name and ' ' in author_name:
        # "First Last" form: keep the last name only.
        author_name = author_name.split(' ')[-1]
    author_name = _strip_marks(author_name)
    author_name = re.sub("[^A-Z]", "", author_name)

    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)

    if subtitle:
        # Subtitles may come from scraped HTML: drop <span> markup.
        subtitle = re.sub(r'</span>', '', subtitle)
        subtitle = re.sub(r'<span.*>', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')
        # Drop an optional leading "R" before the number — presumably a
        # reissue marker; TODO confirm with the cataloguing convention.
        start = re.sub("^R?", "", start)
        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize the whole identifier once more, in order to remove accents.
    side_identifier = _strip_marks(side_identifier)

    return side_identifier
|
|
|
|
|
|
|
|
|
|
|
2019-08-11 08:40:39 +00:00
|
|
|
|
class MediaAdminForm(ModelForm):
|
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
|
|
|
super().__init__(*args, **kwargs)
|
2019-08-15 14:30:44 +00:00
|
|
|
|
isbn_field = self.fields.get('isbn')
|
|
|
|
|
if isbn_field:
|
|
|
|
|
isbn_field.widget.template_name = "media/isbn_button.html"
|
|
|
|
|
isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
|
2020-10-02 11:12:40 +00:00
|
|
|
|
side_identifier_field = self.fields.get('side_identifier')
|
|
|
|
|
if side_identifier_field and self.instance and self.instance.pk:
|
|
|
|
|
instance = self.instance
|
|
|
|
|
title, authors, subtitle = instance.title, instance.authors.all(), None
|
|
|
|
|
if hasattr(instance, "subtitle"):
|
|
|
|
|
subtitle = instance.subtitle
|
|
|
|
|
side_identifier_field.widget.attrs.update(
|
|
|
|
|
{'data-generated-side-identifier': generate_side_identifier(title, authors, subtitle)})
|
|
|
|
|
side_identifier_field.widget.template_name = "media/generate_side_identifier.html"
|
2019-08-11 08:40:39 +00:00
|
|
|
|
|
2020-10-03 13:36:28 +00:00
|
|
|
|
def download_data_isbndb(self, isbn):
|
|
|
|
|
api_url = "https://api2.isbndb.com/book/" + str(isbn) + "?Authorization=" + os.getenv("ISBNDB_KEY")
|
2020-10-03 13:40:33 +00:00
|
|
|
|
req = urllib.request.Request(api_url)
|
|
|
|
|
req.add_header("Authorization", os.getenv("ISBNDB_KEY"))
|
2020-10-03 13:51:25 +00:00
|
|
|
|
try:
|
|
|
|
|
with urllib.request.urlopen(req) as url:
|
|
|
|
|
data: dict = json.loads(url.read().decode())["book"]
|
|
|
|
|
except HTTPError:
|
|
|
|
|
return False
|
2020-10-03 13:41:49 +00:00
|
|
|
|
print(data)
|
2020-10-03 13:47:22 +00:00
|
|
|
|
data.setdefault("title", "")
|
|
|
|
|
data.setdefault("date_published", "1970-01-01")
|
|
|
|
|
data.setdefault("pages", 0)
|
|
|
|
|
data.setdefault("authors", [])
|
|
|
|
|
data.setdefault("image", "")
|
2020-10-03 13:41:49 +00:00
|
|
|
|
self.cleaned_data["title"] = data["title"]
|
2020-10-03 13:55:56 +00:00
|
|
|
|
self.cleaned_data["publish_date"] = data["date_published"][:10]
|
|
|
|
|
while len(self.cleaned_data["publish_date"]) == 4 or len(self.cleaned_data["publish_date"]) == 7:
|
|
|
|
|
self.cleaned_data["publish_date"] += "-01"
|
2020-10-03 13:36:28 +00:00
|
|
|
|
self.cleaned_data["number_of_pages"] = data["pages"]
|
|
|
|
|
self.cleaned_data["authors"] = \
|
|
|
|
|
list(Auteur.objects.get_or_create(name=author_name)[0] for author_name in data["authors"])
|
|
|
|
|
self.cleaned_data["external_url"] = data["image"]
|
|
|
|
|
return True
|
|
|
|
|
|
2019-08-19 10:17:49 +00:00
|
|
|
|
def download_data_bedeteque(self, isbn):
|
|
|
|
|
"""
|
|
|
|
|
Download data from bedeteque
|
|
|
|
|
:return True if success
|
2019-08-11 08:40:39 +00:00
|
|
|
|
"""
|
2019-08-19 10:17:49 +00:00
|
|
|
|
scraper = BedetequeScraper()
|
|
|
|
|
r = scraper.search_by_isbn(isbn)
|
|
|
|
|
if not r:
|
|
|
|
|
return False
|
|
|
|
|
# If results, then take the most accurate
|
|
|
|
|
data = scraper.scrap_bd_info(r[0])
|
|
|
|
|
self.cleaned_data.update(data)
|
|
|
|
|
return True
|
|
|
|
|
|
2020-05-12 15:08:40 +00:00
|
|
|
|
def download_data_google(self, isbn):
|
|
|
|
|
"""
|
|
|
|
|
Download data from google books
|
|
|
|
|
:return True if success
|
|
|
|
|
"""
|
2020-05-12 15:34:53 +00:00
|
|
|
|
api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
|
|
|
|
|
.format(isbn)
|
2020-05-12 15:08:40 +00:00
|
|
|
|
with urllib.request.urlopen(api_url) as url:
|
|
|
|
|
data = json.loads(url.read().decode())
|
|
|
|
|
|
|
|
|
|
if data and data['totalItems']:
|
2020-10-03 13:10:43 +00:00
|
|
|
|
fetched_item = None
|
|
|
|
|
for item in data['items']:
|
|
|
|
|
for identifiers in item["volumeInfo"]["industryIdentifiers"]:
|
|
|
|
|
if identifiers["identifier"] == isbn:
|
|
|
|
|
fetched_item = item
|
|
|
|
|
break
|
|
|
|
|
if fetched_item:
|
|
|
|
|
break
|
|
|
|
|
if not fetched_item:
|
|
|
|
|
return False
|
2020-05-12 15:08:40 +00:00
|
|
|
|
# Fill the data
|
2020-10-03 13:10:43 +00:00
|
|
|
|
self.parse_data_google(fetched_item)
|
2020-05-12 15:08:40 +00:00
|
|
|
|
return True
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
def parse_data_google(self, data):
|
|
|
|
|
info = data['volumeInfo']
|
|
|
|
|
self.cleaned_data['external_url'] = info['canonicalVolumeLink']
|
|
|
|
|
if 'title' in info:
|
|
|
|
|
self.cleaned_data['title'] = info['title']
|
|
|
|
|
if 'subtitle' in data:
|
|
|
|
|
self.cleaned_data['subtitle'] = info['subtitle']
|
|
|
|
|
|
|
|
|
|
if 'pageCount' in info:
|
|
|
|
|
self.cleaned_data['number_of_pages'] = \
|
|
|
|
|
info['pageCount']
|
|
|
|
|
elif not self.cleaned_data['number_of_pages']:
|
|
|
|
|
self.cleaned_data['number_of_pages'] = 0
|
|
|
|
|
|
|
|
|
|
if 'publishedDate' in info:
|
2020-10-02 15:12:02 +00:00
|
|
|
|
if "-" not in info["publishedDate"]:
|
|
|
|
|
info["publishedDate"] += "-01-01"
|
2020-10-03 13:15:35 +00:00
|
|
|
|
elif len(info["publishedDate"]) == 7:
|
|
|
|
|
info["publishedDate"] += "-01"
|
2020-10-02 15:42:56 +00:00
|
|
|
|
self.cleaned_data['publish_date'] = info['publishedDate'][:10]
|
2020-05-12 15:08:40 +00:00
|
|
|
|
|
2020-05-21 15:07:50 +00:00
|
|
|
|
if 'authors' not in self.cleaned_data \
|
|
|
|
|
or not self.cleaned_data['authors']:
|
2020-05-12 15:08:40 +00:00
|
|
|
|
self.cleaned_data['authors'] = list()
|
|
|
|
|
|
|
|
|
|
if 'authors' in info:
|
|
|
|
|
for author in info['authors']:
|
|
|
|
|
author_obj = Auteur.objects.get_or_create(
|
|
|
|
|
name=author)[0]
|
|
|
|
|
self.cleaned_data['authors'].append(author_obj)
|
|
|
|
|
|
|
|
|
|
print(self.cleaned_data)
|
|
|
|
|
|
2019-08-19 10:17:49 +00:00
|
|
|
|
def download_data_openlibrary(self, isbn):
|
|
|
|
|
"""
|
|
|
|
|
Download data from openlibrary
|
|
|
|
|
:return True if success
|
2019-08-11 08:40:39 +00:00
|
|
|
|
"""
|
|
|
|
|
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
|
|
|
|
|
"&format=json&jscmd=data".format(isbn)
|
|
|
|
|
with urllib.request.urlopen(api_url) as url:
|
|
|
|
|
data = json.loads(url.read().decode())
|
2020-02-10 10:47:59 +00:00
|
|
|
|
|
2019-08-11 08:40:39 +00:00
|
|
|
|
if data and data['ISBN:' + isbn]:
|
|
|
|
|
data = data['ISBN:' + isbn]
|
2019-08-11 09:19:24 +00:00
|
|
|
|
if 'url' in data:
|
2019-08-19 10:17:49 +00:00
|
|
|
|
# Fill the data
|
2020-02-10 11:08:48 +00:00
|
|
|
|
self.parse_data_openlibrary(data)
|
|
|
|
|
return True
|
|
|
|
|
return False
|
2020-02-10 10:47:59 +00:00
|
|
|
|
|
2020-02-10 11:08:48 +00:00
|
|
|
|
def parse_data_openlibrary(self, data):
|
|
|
|
|
self.cleaned_data['external_url'] = data['url']
|
|
|
|
|
if 'title' in data:
|
|
|
|
|
self.cleaned_data['title'] = data['title']
|
|
|
|
|
if 'subtitle' in data:
|
|
|
|
|
self.cleaned_data['subtitle'] = data['subtitle']
|
2020-02-10 10:47:59 +00:00
|
|
|
|
|
2020-02-10 11:08:48 +00:00
|
|
|
|
if 'number_of_pages' in data:
|
|
|
|
|
self.cleaned_data['number_of_pages'] = \
|
|
|
|
|
data['number_of_pages']
|
|
|
|
|
elif not self.cleaned_data['number_of_pages']:
|
|
|
|
|
self.cleaned_data['number_of_pages'] = 0
|
2020-02-10 10:47:59 +00:00
|
|
|
|
|
2020-02-10 11:08:48 +00:00
|
|
|
|
if 'publish_date' in data:
|
|
|
|
|
months = ['January', 'February', "March", "April", "Mai",
|
|
|
|
|
"June", "July", "August", "September",
|
|
|
|
|
"October", "November", "December"]
|
|
|
|
|
split = data['publish_date'].replace(',', '').split(' ')
|
2020-02-10 18:33:34 +00:00
|
|
|
|
if len(split) == 1:
|
|
|
|
|
self.cleaned_data['publish_date'] = split[0] + "-01-01"
|
|
|
|
|
else:
|
2020-05-12 12:56:31 +00:00
|
|
|
|
month_to_number = dict(
|
|
|
|
|
Jan="01",
|
|
|
|
|
Feb="02",
|
|
|
|
|
Mar="03",
|
|
|
|
|
Apr="04",
|
|
|
|
|
May="05",
|
|
|
|
|
Jun="06",
|
|
|
|
|
Jul="07",
|
|
|
|
|
Aug="08",
|
|
|
|
|
Sep="09",
|
|
|
|
|
Oct="10",
|
|
|
|
|
Nov="11",
|
|
|
|
|
Dec="12",
|
|
|
|
|
)
|
2020-05-12 15:08:40 +00:00
|
|
|
|
if split[0][:3] in month_to_number:
|
2020-05-12 12:56:31 +00:00
|
|
|
|
self.cleaned_data['publish_date']\
|
|
|
|
|
= split[2] + "-" \
|
2020-05-12 15:08:40 +00:00
|
|
|
|
+ month_to_number[split[0][:3]] + "-" + split[1]
|
2020-05-12 12:56:31 +00:00
|
|
|
|
else:
|
|
|
|
|
self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
|
|
|
|
|
.format(split[2], months.index(split[0])
|
|
|
|
|
+ 1, int(split[1]), )
|
2020-02-10 11:08:48 +00:00
|
|
|
|
|
2020-05-21 15:07:50 +00:00
|
|
|
|
if 'authors' not in self.cleaned_data \
|
|
|
|
|
or not self.cleaned_data['authors']:
|
2020-05-12 15:08:40 +00:00
|
|
|
|
self.cleaned_data['authors'] = list()
|
|
|
|
|
|
2020-02-10 11:08:48 +00:00
|
|
|
|
if 'authors' in data:
|
|
|
|
|
for author in data['authors']:
|
|
|
|
|
author_obj = Auteur.objects.get_or_create(
|
|
|
|
|
name=author['name'])[0]
|
|
|
|
|
self.cleaned_data['authors'].append(author_obj)
|
2019-08-11 08:40:39 +00:00
|
|
|
|
|
|
|
|
|
def clean(self):
|
|
|
|
|
"""
|
|
|
|
|
If user fetch ISBN data, then download data before validating the form
|
|
|
|
|
"""
|
2020-02-10 02:08:08 +00:00
|
|
|
|
super().clean()
|
|
|
|
|
|
2020-05-21 14:56:41 +00:00
|
|
|
|
if "_isbn" in self.data\
|
|
|
|
|
or "_isbn_addanother" in self.data:
|
2019-08-11 08:40:39 +00:00
|
|
|
|
isbn = self.cleaned_data.get('isbn')
|
2020-05-21 14:56:41 +00:00
|
|
|
|
if "_isbn_addanother" in self.data:
|
|
|
|
|
self.data = self.data.copy()
|
|
|
|
|
self.data['_addanother'] = 42
|
2020-05-22 19:08:44 +00:00
|
|
|
|
self.request.POST = self.data
|
2019-08-11 08:40:39 +00:00
|
|
|
|
if isbn:
|
2020-10-03 13:36:28 +00:00
|
|
|
|
scrap_result = self.download_data_isbndb(isbn)
|
2019-08-19 10:17:49 +00:00
|
|
|
|
if not scrap_result:
|
2020-10-03 13:36:28 +00:00
|
|
|
|
# ISBN is present, try with bedeteque
|
|
|
|
|
scrap_result = self.download_data_bedeteque(isbn)
|
2020-05-12 15:08:40 +00:00
|
|
|
|
if not scrap_result:
|
2020-10-03 13:36:28 +00:00
|
|
|
|
# Try with Google
|
|
|
|
|
scrap_result = self.download_data_google(isbn)
|
|
|
|
|
if not scrap_result:
|
|
|
|
|
# Try with OpenLibrary
|
|
|
|
|
if not self.download_data_openlibrary(isbn):
|
|
|
|
|
self.add_error('isbn',
|
|
|
|
|
_("This ISBN is not found."))
|
|
|
|
|
return self.cleaned_data
|
2019-08-11 08:40:39 +00:00
|
|
|
|
|
2020-02-10 17:23:13 +00:00
|
|
|
|
if self.cleaned_data['title']:
|
2020-02-11 20:12:00 +00:00
|
|
|
|
self.cleaned_data['title'] = re.sub(
|
|
|
|
|
r'\(AUT\) ',
|
|
|
|
|
'',
|
|
|
|
|
self.cleaned_data['title']
|
|
|
|
|
)
|
2020-02-10 17:23:13 +00:00
|
|
|
|
|
2020-02-10 10:29:26 +00:00
|
|
|
|
if self.cleaned_data['authors']:
|
2020-09-28 08:51:21 +00:00
|
|
|
|
side_identifier = generate_side_identifier(
|
|
|
|
|
self.cleaned_data["title"],
|
|
|
|
|
self.cleaned_data["authors"],
|
|
|
|
|
self.cleaned_data["subtitle"],
|
|
|
|
|
)
|
|
|
|
|
|
2020-02-10 10:29:26 +00:00
|
|
|
|
self.cleaned_data['side_identifier'] = side_identifier
|
|
|
|
|
|
2020-02-10 02:08:08 +00:00
|
|
|
|
return self.cleaned_data
|
|
|
|
|
|
|
|
|
|
def _clean_fields(self):
|
|
|
|
|
for name, field in self.fields.items():
|
|
|
|
|
# value_from_datadict() gets the data from the data dictionaries.
|
|
|
|
|
# Each widget type knows how to retrieve its own data, because some
|
|
|
|
|
# widgets split data over several HTML fields.
|
|
|
|
|
if field.disabled:
|
|
|
|
|
value = self.get_initial_for_field(field, name)
|
|
|
|
|
else:
|
2020-02-10 03:59:39 +00:00
|
|
|
|
value = field.widget.value_from_datadict(
|
|
|
|
|
self.data, self.files, self.add_prefix(name))
|
2020-02-10 02:08:08 +00:00
|
|
|
|
from django.core.exceptions import ValidationError
|
|
|
|
|
try:
|
|
|
|
|
# We don't want to check a field when we enter an ISBN.
|
2020-05-21 14:56:41 +00:00
|
|
|
|
if "isbn" not in self.data \
|
2020-02-10 03:59:39 +00:00
|
|
|
|
or not self.cleaned_data.get('isbn'):
|
2020-02-10 02:08:08 +00:00
|
|
|
|
value = field.clean(value)
|
|
|
|
|
self.cleaned_data[name] = value
|
|
|
|
|
if hasattr(self, 'clean_%s' % name):
|
|
|
|
|
value = getattr(self, 'clean_%s' % name)()
|
|
|
|
|
self.cleaned_data[name] = value
|
|
|
|
|
except ValidationError as e:
|
|
|
|
|
self.add_error(name, e)
|
2020-05-21 14:56:41 +00:00
|
|
|
|
|
|
|
|
|
    class Meta:
        # Bind this admin form to the BD model and expose every model field.
        model = BD
        fields = '__all__'
|