Fix authors & OpenLibrary scraping

This commit is contained in:
Yohann D'ANELLO 2020-02-10 11:29:26 +01:00
parent c7d804d9bf
commit 7fd8e92371
2 changed files with 45 additions and 20 deletions

View File

@ -7,6 +7,7 @@ import urllib.request
from django.forms import ModelForm from django.forms import ModelForm
from .models import Auteur
from .scraper import BedetequeScraper from .scraper import BedetequeScraper
@ -30,6 +31,7 @@ class MediaAdminForm(ModelForm):
# If results, then take the most accurate # If results, then take the most accurate
data = scraper.scrap_bd_info(r[0]) data = scraper.scrap_bd_info(r[0])
self.cleaned_data.update(data) self.cleaned_data.update(data)
print(self.cleaned_data)
return True return True
def download_data_openlibrary(self, isbn): def download_data_openlibrary(self, isbn):
@ -53,6 +55,23 @@ class MediaAdminForm(ModelForm):
if 'number_of_pages' in data: if 'number_of_pages' in data:
self.cleaned_data['number_of_pages'] = \ self.cleaned_data['number_of_pages'] = \
data['number_of_pages'] data['number_of_pages']
elif not self.cleaned_data['number_of_pages']:
self.cleaned_data['number_of_pages'] = 0
if 'publish_date' in data:
months = ['January', 'February', "March", "April", "Mai",
"June", "July", "August", "September",
"October", "November","December"]
split = data['publish_date'].replace(',', '').split(' ')
self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}"\
.format(split[2], months.index(split[0])+1,
int(split[1]),)
if 'authors' in data:
if 'author' not in self.cleaned_data:
self.cleaned_data['authors'] = list()
for author in data['authors']:
author_obj = Auteur.objects.get_or_create(
name=author['name'])[0]
self.cleaned_data['authors'].append(author_obj)
return True return True
return False return False
@ -62,7 +81,6 @@ class MediaAdminForm(ModelForm):
""" """
super().clean() super().clean()
# TODO implement authors, side_identifier
if "_continue" in self.request.POST: if "_continue" in self.request.POST:
isbn = self.cleaned_data.get('isbn') isbn = self.cleaned_data.get('isbn')
if isbn: if isbn:
@ -72,6 +90,23 @@ class MediaAdminForm(ModelForm):
# Try with OpenLibrary # Try with OpenLibrary
self.download_data_openlibrary(isbn) self.download_data_openlibrary(isbn)
if self.cleaned_data['authors']:
author_name = self.cleaned_data['authors'][0].name
if ',' not in author_name and ' ' in author_name:
author_name = author_name.split(' ')[1]
side_identifier = "{:.3} {:.3}".format(
author_name.upper(),
self.cleaned_data['title'].upper(), )
if self.cleaned_data['subtitle']:
start = self.cleaned_data['subtitle'] \
.split(' ')[0].replace('.', '')
if start.isnumeric():
side_identifier += " {:0>2}".format(start, )
self.cleaned_data['side_identifier'] = side_identifier
return self.cleaned_data return self.cleaned_data
def _clean_fields(self): def _clean_fields(self):

View File

@ -5,6 +5,8 @@ import re
import requests import requests
from media.models import Auteur
class BedetequeScraper: class BedetequeScraper:
""" """
@ -75,10 +77,6 @@ class BedetequeScraper:
subtitle = subtitle.replace('<span class="numa"></span>', '') subtitle = subtitle.replace('<span class="numa"></span>', '')
data['subtitle'] = ' '.join(subtitle.split()) data['subtitle'] = ' '.join(subtitle.split())
# TODO implement author
# regex_author = r'author\">([^<]*)</span'
# 'author': re.search(regex_author, content).group(1),
# Get publish date # Get publish date
search_publish_date = re.search(regex_publish_date, content) search_publish_date = re.search(regex_publish_date, content)
if search_publish_date: if search_publish_date:
@ -92,23 +90,15 @@ class BedetequeScraper:
# Get author and illustrator # Get author and illustrator
author = re.search(regex_author, content) author = re.search(regex_author, content)
if 'author' not in data: if 'author' not in data:
data['author'] = list() data['authors'] = list()
if author: if author:
data['author'].append(author.group(1)) author_obj = Auteur.objects.get_or_create(
name=author.group(1))[0]
data['authors'].append(author_obj)
illustrator = re.search(regex_illustrator, content) illustrator = re.search(regex_illustrator, content)
if illustrator: if illustrator:
data['author'].append(illustrator.group(1)) author_obj = Auteur.objects.get_or_create(
name=illustrator.group(1))[0]
author_name = data['author'][0] data['authors'].append(author_obj)
if ',' not in author_name and ' ' in author_name:
author_name = author_name.split(' ')[1]
side_identifier = "{:.3} {:.3}".format(author_name.upper(),
data['title'].upper(),)
if data['subtitle']:
start = data['subtitle'].split(' ')[0].replace('.', '')
print("start:", start)
if start.isnumeric():
side_identifier += " {:0>2}".format(start,)
data['side_identifier'] = side_identifier
return data return data