1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2024-11-30 12:13:03 +00:00
med/media/forms.py

269 lines
10 KiB
Python
Raw Normal View History

2019-08-02 12:57:53 +00:00
# -*- mode: python; coding: utf-8 -*-
# Copyright (C) 2017-2019 by BDE ENS Paris-Saclay
# SPDX-License-Identifier: GPL-3.0-or-later
2019-08-11 08:40:39 +00:00
import json
import re
import unicodedata
import urllib.parse
import urllib.request
2019-08-11 08:40:39 +00:00
2019-08-02 12:57:53 +00:00
from django.forms import ModelForm
2020-05-12 15:08:40 +00:00
from django.utils.translation import gettext_lazy as _
2019-08-02 12:57:53 +00:00
2020-05-22 16:04:41 +00:00
from .models import Auteur, BD
2019-08-19 10:17:49 +00:00
from .scraper import BedetequeScraper
2019-08-11 08:40:39 +00:00
class MediaAdminForm(ModelForm):
    def __init__(self, *args, **kwargs):
        """Attach the ISBN lookup widget when the form has an ``isbn`` field."""
        super().__init__(*args, **kwargs)
        field = self.fields.get('isbn')
        if field:
            # Render the field with the extra "fetch data" button and put
            # the cursor there so a barcode scanner can type straight in.
            field.widget.template_name = "media/isbn_button.html"
            field.widget.attrs.update({'autofocus': 'autofocus'})
2019-08-11 08:40:39 +00:00
2019-08-19 10:17:49 +00:00
def download_data_bedeteque(self, isbn):
    """
    Look up the given ISBN on bedeteque.com and merge the scraped
    fields into ``self.cleaned_data``.
    :return True if success
    """
    scraper = BedetequeScraper()
    results = scraper.search_by_isbn(isbn)
    if not results:
        return False
    # Results come ordered by relevance: keep only the best match.
    self.cleaned_data.update(scraper.scrap_bd_info(results[0]))
    return True
2020-05-12 15:08:40 +00:00
def download_data_google(self, isbn):
    """
    Download data from the Google Books API.

    :param isbn: ISBN to look up; URL-escaped before being inserted
        in the query string (the original interpolated it raw).
    :return True if success
    """
    api_url = "https://www.googleapis.com/books/v1/volumes?q=ISBN:{}"\
        .format(urllib.parse.quote(isbn))
    with urllib.request.urlopen(api_url) as url:
        data = json.loads(url.read().decode())
        # .get avoids a KeyError on an unexpected payload shape.
        if data and data.get('totalItems'):
            # Take the first (most relevant) result and fill the form data.
            self.parse_data_google(data['items'][0])
            return True
    return False
def parse_data_google(self, data):
    """
    Fill ``self.cleaned_data`` from one Google Books volume record.

    :param data: one item of the API response (dict with a
        ``volumeInfo`` key).
    """
    info = data['volumeInfo']
    # Guard instead of unconditional access: not every record carries
    # a canonical link.
    if 'canonicalVolumeLink' in info:
        self.cleaned_data['external_url'] = info['canonicalVolumeLink']
    if 'title' in info:
        self.cleaned_data['title'] = info['title']
    # Fixed: the subtitle lives in 'volumeInfo', not in the top-level
    # record — the original tested `'subtitle' in data` and therefore
    # never copied it.
    if 'subtitle' in info:
        self.cleaned_data['subtitle'] = info['subtitle']
    if 'pageCount' in info:
        self.cleaned_data['number_of_pages'] = info['pageCount']
    elif not self.cleaned_data.get('number_of_pages'):
        # Field is NOT NULL in the model: default to 0 when unknown.
        self.cleaned_data['number_of_pages'] = 0
    if 'publishedDate' in info:
        self.cleaned_data['publish_date'] = info['publishedDate']
    # Keep authors already present (e.g. typed manually); only start a
    # fresh list when there is none.
    if 'authors' not in self.cleaned_data \
            or not self.cleaned_data['authors']:
        self.cleaned_data['authors'] = list()
    if 'authors' in info:
        for author in info['authors']:
            author_obj = Auteur.objects.get_or_create(
                name=author)[0]
            self.cleaned_data['authors'].append(author_obj)
    # (leftover debug print removed)
2019-08-19 10:17:49 +00:00
def download_data_openlibrary(self, isbn):
    """
    Download data from the OpenLibrary books API.

    :param isbn: ISBN to look up; URL-escaped before being inserted
        in the query string (the original interpolated it raw).
    :return True if success
    """
    api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
              "&format=json&jscmd=data".format(urllib.parse.quote(isbn))
    with urllib.request.urlopen(api_url) as url:
        data = json.loads(url.read().decode())

    # .get avoids a KeyError when the payload is non-empty but does
    # not contain the requested ISBN key.
    record = data.get('ISBN:' + isbn) if data else None
    if record and 'url' in record:
        # Fill the form data from the record.
        self.parse_data_openlibrary(record)
        return True
    return False
2020-02-10 10:47:59 +00:00
def parse_data_openlibrary(self, data):
    """
    Fill ``self.cleaned_data`` from an OpenLibrary record.

    Publish dates arrive either as a bare year ("2005") or as
    "Month day, year" ("March 7, 2005"); both are converted to an ISO
    "YYYY-MM-DD" string (January 1st is assumed for bare years).
    """
    self.cleaned_data['external_url'] = data['url']
    if 'title' in data:
        self.cleaned_data['title'] = data['title']
    if 'subtitle' in data:
        self.cleaned_data['subtitle'] = data['subtitle']

    if 'number_of_pages' in data:
        self.cleaned_data['number_of_pages'] = data['number_of_pages']
    elif not self.cleaned_data.get('number_of_pages'):
        # Field is NOT NULL in the model: default to 0 when unknown.
        self.cleaned_data['number_of_pages'] = 0

    if 'publish_date' in data:
        # Fixed: the list contained the French "Mai" instead of "May",
        # which made months.index(...) raise ValueError for May dates
        # falling through to the fallback branch.
        months = ["January", "February", "March", "April", "May",
                  "June", "July", "August", "September",
                  "October", "November", "December"]
        split = data['publish_date'].replace(',', '').split(' ')
        if len(split) == 1:
            # Bare year: default to January 1st.
            self.cleaned_data['publish_date'] = split[0] + "-01-01"
        else:
            month_to_number = dict(
                Jan="01", Feb="02", Mar="03", Apr="04",
                May="05", Jun="06", Jul="07", Aug="08",
                Sep="09", Oct="10", Nov="11", Dec="12",
            )
            if split[0][:3] in month_to_number:
                # Fixed: zero-pad the day so the result is a valid ISO
                # date ("2005-03-07", not "2005-03-7"), consistent with
                # the fallback branch below.
                self.cleaned_data['publish_date'] = \
                    "{}-{}-{:0>2}".format(
                        split[2], month_to_number[split[0][:3]], split[1])
            else:
                self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
                    .format(split[2], months.index(split[0]) + 1,
                            int(split[1]), )

    # Keep authors already present; only start a fresh list when
    # there is none.
    if 'authors' not in self.cleaned_data \
            or not self.cleaned_data['authors']:
        self.cleaned_data['authors'] = list()
    if 'authors' in data:
        for author in data['authors']:
            author_obj = Auteur.objects.get_or_create(
                name=author['name'])[0]
            self.cleaned_data['authors'].append(author_obj)
2019-08-11 08:40:39 +00:00
def clean(self):
    """
    If the user asked for an ISBN lookup, download the book data from
    the providers before validating the form; otherwise normalize the
    title/subtitle and compute the shelf side identifier.
    """
    super().clean()

    if "_isbn" in self.data \
            or "_isbn_addanother" in self.data:
        isbn = self.cleaned_data.get('isbn')
        if "_isbn_addanother" in self.data:
            # Make the admin behave as if "save and add another" was
            # clicked after the lookup.
            self.data = self.data.copy()
            self.data['_addanother'] = 42
            self.request.POST = self.data
        if isbn:
            # ISBN is present: providers are tried in order —
            # bedeteque first, then Google, then OpenLibrary.
            found = self.download_data_bedeteque(isbn)
            if not found:
                found = self.download_data_google(isbn)
            if not found and not self.download_data_openlibrary(isbn):
                self.add_error('isbn',
                               _("This ISBN is not found."))
            return self.cleaned_data

    if self.cleaned_data['title']:
        # Strip the "(AUT) " marker that some scraped titles carry.
        self.cleaned_data['title'] = re.sub(
            r'\(AUT\) ',
            '',
            self.cleaned_data['title']
        )

    if self.cleaned_data['authors']:
        authors = self.cleaned_data['authors']
        old_authors = authors.copy()

        def sort_key(author):
            # Decreasing note, then original position, then name.
            return str(-author.note) + "." \
                + str(old_authors.index(author)) \
                + "." + author.name

        authors.sort(key=sort_key)

        author_name = self.cleaned_data['authors'][0].name
        if ',' not in author_name and ' ' in author_name:
            # Keep only the last name.
            author_name = author_name.split(' ')[-1]

        # Drop leading articles before building the identifier.
        normalized_title = self.cleaned_data['title'].upper()
        for article in (r'^LE ', r'^LA ', r'^LES ', r'^L\'', r'^THE '):
            normalized_title = re.sub(article, '', normalized_title)
        normalized_title = re.sub(r'Œ', 'OE', normalized_title)

        side_identifier = "{:.3} {:.3}".format(
            author_name,
            normalized_title.replace(' ', ''), )

        if self.cleaned_data['subtitle']:
            # Remove HTML span tags leaking from scraped subtitles.
            self.cleaned_data['subtitle'] = re.sub(
                r'</span>',
                '',
                self.cleaned_data['subtitle']
            )
            self.cleaned_data['subtitle'] = re.sub(
                r'<span.*>',
                '',
                self.cleaned_data['subtitle']
            )
            leading = self.cleaned_data['subtitle'].split(' ')[0] \
                .replace('.', '')
            if leading.isnumeric():
                # Volume number: append it zero-padded.
                side_identifier += " {:0>2}".format(leading, )

        # Normalize side identifier, in order to remove accents
        side_identifier = ''.join(
            char
            for char in unicodedata.normalize(
                'NFKD', side_identifier.casefold())
            if all(not unicodedata.category(char).startswith(cat)
                   for cat in {'M', 'P', 'Z', 'C'}) or char == ' '
        ).casefold().upper()

        self.cleaned_data['side_identifier'] = side_identifier
    return self.cleaned_data
def _clean_fields(self):
    """
    Variant of ``BaseForm._clean_fields`` that skips per-field
    validation once an ISBN has been entered and cleaned: the
    remaining fields keep their raw value so the downloaded data can
    replace them without spurious validation errors.
    """
    # Hoisted out of the loop: one import, not one per field.
    from django.core.exceptions import ValidationError

    for name, field in self.fields.items():
        # value_from_datadict() gets the data from the data dictionaries.
        # Each widget type knows how to retrieve its own data, because some
        # widgets split data over several HTML fields.
        if field.disabled:
            value = self.get_initial_for_field(field, name)
        else:
            value = field.widget.value_from_datadict(
                self.data, self.files, self.add_prefix(name))
        try:
            # We don't want to check a field when we enter an ISBN.
            # NOTE: this condition is deliberately re-evaluated per
            # field — cleaned_data gains the 'isbn' key mid-loop, so
            # fields processed before it are still validated.
            if "isbn" not in self.data \
                    or not self.cleaned_data.get('isbn'):
                value = field.clean(value)
            self.cleaned_data[name] = value
            if hasattr(self, 'clean_%s' % name):
                value = getattr(self, 'clean_%s' % name)()
                self.cleaned_data[name] = value
        except ValidationError as e:
            self.add_error(name, e)
2020-05-21 14:56:41 +00:00
class Meta:
    # Expose every field of the BD model in the admin form.
    model = BD
    fields = '__all__'