# med/media/forms.py
# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2020 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later
import json
import os
import re
import unicodedata
import urllib.request

from django.core.exceptions import ValidationError
from django.db.models import QuerySet
from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _

from .models import Auteur, BD
from .scraper import BedetequeScraper


def generate_side_identifier(title, authors, subtitle=None):
    if isinstance(authors, QuerySet):
        authors = list(authors)

    title_normalized = title.upper()
    # Replace typographic apostrophes with straight quotes.
    title_normalized = title_normalized.replace('’', '\'')
    # Strip leading French and English articles.
    title_normalized = re.sub(r'^DE ', '', title_normalized)
    title_normalized = re.sub(r'^DES ', '', title_normalized)
    title_normalized = re.sub(r'^LE ', '', title_normalized)
    title_normalized = re.sub(r'^LA ', '', title_normalized)
    title_normalized = re.sub(r'^LES ', '', title_normalized)
    title_normalized = re.sub(r'^L\'', '', title_normalized)
    title_normalized = re.sub(r'^UN ', '', title_normalized)
    title_normalized = re.sub(r'^UNE ', '', title_normalized)
    title_normalized = re.sub(r'^THE ', '', title_normalized)
    title_normalized = re.sub(r'Œ', 'OE', title_normalized)
    title_normalized = title_normalized.replace(' ', '')
    # Drop combining marks, punctuation, separators and control characters.
    title_normalized = ''.join(
        char
        for char in unicodedata.normalize(
            'NFKD', title_normalized.casefold())
        if all(not unicodedata.category(char).startswith(cat)
               for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
    ).casefold().upper()
    title_normalized = re.sub("[^A-Z0-9$]", "", title_normalized)

    authors = authors.copy()

    def sort(author):
        # Highest note first, then last name, then primary key.
        return "{:042d}".format(-author.note) \
            + author.name.split(" ")[-1] + ".{:042d}".format(author.pk)

    authors.sort(key=sort)
    primary_author = authors[0]
    author_name = primary_author.name.upper()
    if ',' not in author_name and ' ' in author_name:
        author_name = author_name.split(' ')[-1]
    author_name = ''.join(
        char for char in unicodedata.normalize('NFKD', author_name.casefold())
        if all(not unicodedata.category(char).startswith(cat)
               for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
    ).casefold().upper()
    author_name = re.sub("[^A-Z]", "", author_name)

    side_identifier = "{:.3} {:.3}".format(author_name, title_normalized)

    if subtitle:
        subtitle = re.sub(r'</span>', '', subtitle)
        subtitle = re.sub(r'<span.*>', '', subtitle)
        start = subtitle.split(' ')[0].replace('.', '')
        start = re.sub("^R?", "", start)
        if start.isnumeric():
            side_identifier += " {:0>2}".format(start)

    # Normalize the side identifier in order to remove accents.
    side_identifier = ''.join(
        char for char in unicodedata.normalize('NFKD',
                                               side_identifier.casefold())
        if all(not unicodedata.category(char).startswith(cat)
               for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
    ).casefold().upper()

    return side_identifier
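
# Illustrative example (hypothetical data, not taken from the project): with
# an Auteur named "Hergé" and the title "Les Bijoux de la Castafiore" whose
# subtitle is "21. Les Bijoux de la Castafiore", generate_side_identifier
# should yield "HER BIJ 21": the leading article is dropped, the first three
# letters of the author's last name and of the title are kept, and the numeric
# prefix of the subtitle becomes the volume number.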


class MediaAdminForm(ModelForm):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        isbn_field = self.fields.get('isbn')
        if isbn_field:
            # Render the ISBN field with a "fetch data" button and focus it.
            isbn_field.widget.template_name = "media/isbn_button.html"
            isbn_field.widget.attrs.update({'autofocus': 'autofocus'})

        side_identifier_field = self.fields.get('side_identifier')
        if side_identifier_field and self.instance and self.instance.pk:
            # Suggest a side identifier generated from the stored title,
            # authors and subtitle.
            instance = self.instance
            title, authors, subtitle = \
                instance.title, instance.authors.all(), None
            if hasattr(instance, "subtitle"):
                subtitle = instance.subtitle
            side_identifier_field.widget.attrs.update(
                {'data-generated-side-identifier':
                    generate_side_identifier(title, authors, subtitle)})
            side_identifier_field.widget.template_name = \
                "media/generate_side_identifier.html"

    def download_data_isbndb(self, isbn):
        """
        Download data from isbndb
        :return True if success
        """
        api_url = "https://api2.isbndb.com/book/" + str(isbn) \
            + "?Authorization=" + os.getenv("ISBNDB_KEY")
        with urllib.request.urlopen(api_url) as url:
            data = json.loads(url.read().decode())
            self.cleaned_data["title"] = data["title_long"]
            self.cleaned_data["published_date"] = data["date_published"][:10]
            self.cleaned_data["number_of_pages"] = data["pages"]
            self.cleaned_data["authors"] = \
                list(Auteur.objects.get_or_create(name=author_name)[0]
                     for author_name in data["authors"])
            self.cleaned_data["external_url"] = data["image"]
        return True

    def download_data_bedeteque(self, isbn):
        """
        Download data from bedeteque
        :return True if success
        """
        scraper = BedetequeScraper()
        r = scraper.search_by_isbn(isbn)
        if not r:
            return False
        # If there are results, take the most accurate one.
        data = scraper.scrap_bd_info(r[0])
        self.cleaned_data.update(data)
        return True

    def download_data_google(self, isbn):
        """
        Download data from Google Books
        :return True if success
        """
        api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
            .format(isbn)
        with urllib.request.urlopen(api_url) as url:
            data = json.loads(url.read().decode())
            if data and data['totalItems']:
                # Keep only the volume whose identifier matches the ISBN.
                fetched_item = None
                for item in data['items']:
                    for identifiers in item["volumeInfo"]["industryIdentifiers"]:
                        if identifiers["identifier"] == isbn:
                            fetched_item = item
                            break
                    if fetched_item:
                        break
                if not fetched_item:
                    return False
                # Fill the data
                self.parse_data_google(fetched_item)
                return True
        return False

    def parse_data_google(self, data):
        info = data['volumeInfo']
        self.cleaned_data['external_url'] = info['canonicalVolumeLink']
        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']
        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data['number_of_pages']:
            self.cleaned_data['number_of_pages'] = 0
        if 'publishedDate' in info:
            # Pad partial dates ("2004" or "2004-06") to a full ISO date.
            if "-" not in info["publishedDate"]:
                info["publishedDate"] += "-01-01"
            elif len(info["publishedDate"]) == 7:
                info["publishedDate"] += "-01"
            self.cleaned_data['publish_date'] = info['publishedDate'][:10]
        if 'authors' not in self.cleaned_data \
                or not self.cleaned_data['authors']:
            self.cleaned_data['authors'] = list()
        if 'authors' in info:
            for author in info['authors']:
                author_obj = Auteur.objects.get_or_create(name=author)[0]
                self.cleaned_data['authors'].append(author_obj)
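
    # Minimal sketch of the item structure parse_data_google consumes, derived
    # from the fields read above (the values are made up for illustration):
    # {
    #     "volumeInfo": {
    #         "canonicalVolumeLink": "https://books.google.com/...",
    #         "title": "Some title",
    #         "subtitle": "Some subtitle",
    #         "pageCount": 48,
    #         "publishedDate": "2004-06",
    #         "authors": ["Some Author"],
    #         "industryIdentifiers": [{"type": "ISBN_13", "identifier": "..."}],
    #     }
    # }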

    def download_data_openlibrary(self, isbn):
        """
        Download data from OpenLibrary
        :return True if success
        """
        api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                  "&format=json&jscmd=data".format(isbn)
        with urllib.request.urlopen(api_url) as url:
            data = json.loads(url.read().decode())
            if data and data['ISBN:' + isbn]:
                data = data['ISBN:' + isbn]
                if 'url' in data:
                    # Fill the data
                    self.parse_data_openlibrary(data)
                    return True
        return False

    def parse_data_openlibrary(self, data):
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
        if 'subtitle' in data:
            self.cleaned_data['subtitle'] = data['subtitle']
        if 'number_of_pages' in data:
            self.cleaned_data['number_of_pages'] = data['number_of_pages']
        elif not self.cleaned_data['number_of_pages']:
            self.cleaned_data['number_of_pages'] = 0
        if 'publish_date' in data:
            # Convert OpenLibrary dates to "YYYY-MM-DD".
            months = ['January', 'February', "March", "April", "May",
                      "June", "July", "August", "September",
                      "October", "November", "December"]
            split = data['publish_date'].replace(',', '').split(' ')
            if len(split) == 1:
                self.cleaned_data['publish_date'] = split[0] + "-01-01"
            else:
                month_to_number = dict(
                    Jan="01",
                    Feb="02",
                    Mar="03",
                    Apr="04",
                    May="05",
                    Jun="06",
                    Jul="07",
                    Aug="08",
                    Sep="09",
                    Oct="10",
                    Nov="11",
                    Dec="12",
                )
                if split[0][:3] in month_to_number:
                    self.cleaned_data['publish_date'] \
                        = split[2] + "-" \
                        + month_to_number[split[0][:3]] + "-" + split[1]
                else:
                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
                        .format(split[2], months.index(split[0]) + 1,
                                int(split[1]))
        if 'authors' not in self.cleaned_data \
                or not self.cleaned_data['authors']:
            self.cleaned_data['authors'] = list()
        if 'authors' in data:
            for author in data['authors']:
                author_obj = Auteur.objects.get_or_create(
                    name=author['name'])[0]
                self.cleaned_data['authors'].append(author_obj)
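
    # For illustration (hypothetical values): a publish_date of "Jun 15, 2004"
    # or "June 15, 2004" is converted to "2004-06-15", and a bare year such as
    # "2004" becomes "2004-01-01".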

    def clean(self):
        """
        If the user requested an ISBN lookup, download the data before
        validating the form.
        """
        super().clean()
        if "_isbn" in self.data \
                or "_isbn_addanother" in self.data:
            isbn = self.cleaned_data.get('isbn')
            if "_isbn_addanother" in self.data:
                self.data = self.data.copy()
                self.data['_addanother'] = 42
                self.request.POST = self.data
            if isbn:
                # Try each provider in turn: isbndb, Bedeteque, Google Books,
                # then OpenLibrary.
                scrap_result = self.download_data_isbndb(isbn)
                if not scrap_result:
                    # ISBN is present, try with Bedeteque
                    scrap_result = self.download_data_bedeteque(isbn)
                    if not scrap_result:
                        # Try with Google
                        scrap_result = self.download_data_google(isbn)
                        if not scrap_result:
                            # Try with OpenLibrary
                            if not self.download_data_openlibrary(isbn):
                                self.add_error(
                                    'isbn',
                                    _("This ISBN is not found."))
                                return self.cleaned_data
                if self.cleaned_data['title']:
                    self.cleaned_data['title'] = re.sub(
                        r'\(AUT\) ',
                        '',
                        self.cleaned_data['title']
                    )
                if self.cleaned_data['authors']:
                    side_identifier = generate_side_identifier(
                        self.cleaned_data["title"],
                        self.cleaned_data["authors"],
                        self.cleaned_data["subtitle"],
                    )
                    self.cleaned_data['side_identifier'] = side_identifier
        return self.cleaned_data

    def _clean_fields(self):
        for name, field in self.fields.items():
            # value_from_datadict() gets the data from the data dictionaries.
            # Each widget type knows how to retrieve its own data, because
            # some widgets split data over several HTML fields.
            if field.disabled:
                value = self.get_initial_for_field(field, name)
            else:
                value = field.widget.value_from_datadict(
                    self.data, self.files, self.add_prefix(name))
            try:
                # We don't want to check a field when we enter an ISBN.
                if "isbn" not in self.data \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value
                if hasattr(self, 'clean_%s' % name):
                    value = getattr(self, 'clean_%s' % name)()
                    self.cleaned_data[name] = value
            except ValidationError as e:
                self.add_error(name, e)

    class Meta:
        model = BD
        fields = '__all__'
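

# Minimal sketch of how this form might be wired into the Django admin. This
# is an assumption for illustration only: the real registration lives in the
# app's admin module, which also needs to attach the current request because
# clean() uses self.request when "_isbn_addanother" is submitted.
#
#     from django.contrib import admin
#     from .models import BD
#     from .forms import MediaAdminForm
#
#     @admin.register(BD)
#     class BDAdmin(admin.ModelAdmin):
#         form = MediaAdminForm
#
#         def get_form(self, request, obj=None, **kwargs):
#             form = super().get_form(request, obj, **kwargs)
#             form.request = request
#             return form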