mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2024-11-26 20:47:10 +00:00
Fix authors & openlibrary scrap
This commit is contained in:
parent
c7d804d9bf
commit
7fd8e92371
@ -7,6 +7,7 @@ import urllib.request
|
|||||||
|
|
||||||
from django.forms import ModelForm
|
from django.forms import ModelForm
|
||||||
|
|
||||||
|
from .models import Auteur
|
||||||
from .scraper import BedetequeScraper
|
from .scraper import BedetequeScraper
|
||||||
|
|
||||||
|
|
||||||
@ -30,6 +31,7 @@ class MediaAdminForm(ModelForm):
|
|||||||
# If results, then take the most accurate
|
# If results, then take the most accurate
|
||||||
data = scraper.scrap_bd_info(r[0])
|
data = scraper.scrap_bd_info(r[0])
|
||||||
self.cleaned_data.update(data)
|
self.cleaned_data.update(data)
|
||||||
|
print(self.cleaned_data)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def download_data_openlibrary(self, isbn):
|
def download_data_openlibrary(self, isbn):
|
||||||
@ -53,6 +55,23 @@ class MediaAdminForm(ModelForm):
|
|||||||
if 'number_of_pages' in data:
|
if 'number_of_pages' in data:
|
||||||
self.cleaned_data['number_of_pages'] = \
|
self.cleaned_data['number_of_pages'] = \
|
||||||
data['number_of_pages']
|
data['number_of_pages']
|
||||||
|
elif not self.cleaned_data['number_of_pages']:
|
||||||
|
self.cleaned_data['number_of_pages'] = 0
|
||||||
|
if 'publish_date' in data:
|
||||||
|
months = ['January', 'February', "March", "April", "Mai",
|
||||||
|
"June", "July", "August", "September",
|
||||||
|
"October", "November","December"]
|
||||||
|
split = data['publish_date'].replace(',', '').split(' ')
|
||||||
|
self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}"\
|
||||||
|
.format(split[2], months.index(split[0])+1,
|
||||||
|
int(split[1]),)
|
||||||
|
if 'authors' in data:
|
||||||
|
if 'author' not in self.cleaned_data:
|
||||||
|
self.cleaned_data['authors'] = list()
|
||||||
|
for author in data['authors']:
|
||||||
|
author_obj = Auteur.objects.get_or_create(
|
||||||
|
name=author['name'])[0]
|
||||||
|
self.cleaned_data['authors'].append(author_obj)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -62,7 +81,6 @@ class MediaAdminForm(ModelForm):
|
|||||||
"""
|
"""
|
||||||
super().clean()
|
super().clean()
|
||||||
|
|
||||||
# TODO implement authors, side_identifier
|
|
||||||
if "_continue" in self.request.POST:
|
if "_continue" in self.request.POST:
|
||||||
isbn = self.cleaned_data.get('isbn')
|
isbn = self.cleaned_data.get('isbn')
|
||||||
if isbn:
|
if isbn:
|
||||||
@ -72,6 +90,23 @@ class MediaAdminForm(ModelForm):
|
|||||||
# Try with OpenLibrary
|
# Try with OpenLibrary
|
||||||
self.download_data_openlibrary(isbn)
|
self.download_data_openlibrary(isbn)
|
||||||
|
|
||||||
|
if self.cleaned_data['authors']:
|
||||||
|
author_name = self.cleaned_data['authors'][0].name
|
||||||
|
if ',' not in author_name and ' ' in author_name:
|
||||||
|
author_name = author_name.split(' ')[1]
|
||||||
|
side_identifier = "{:.3} {:.3}".format(
|
||||||
|
author_name.upper(),
|
||||||
|
self.cleaned_data['title'].upper(), )
|
||||||
|
|
||||||
|
if self.cleaned_data['subtitle']:
|
||||||
|
start = self.cleaned_data['subtitle'] \
|
||||||
|
.split(' ')[0].replace('.', '')
|
||||||
|
|
||||||
|
if start.isnumeric():
|
||||||
|
side_identifier += " {:0>2}".format(start, )
|
||||||
|
|
||||||
|
self.cleaned_data['side_identifier'] = side_identifier
|
||||||
|
|
||||||
return self.cleaned_data
|
return self.cleaned_data
|
||||||
|
|
||||||
def _clean_fields(self):
|
def _clean_fields(self):
|
||||||
|
@ -5,6 +5,8 @@ import re
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from media.models import Auteur
|
||||||
|
|
||||||
|
|
||||||
class BedetequeScraper:
|
class BedetequeScraper:
|
||||||
"""
|
"""
|
||||||
@ -75,10 +77,6 @@ class BedetequeScraper:
|
|||||||
subtitle = subtitle.replace('<span class="numa"></span>', '')
|
subtitle = subtitle.replace('<span class="numa"></span>', '')
|
||||||
data['subtitle'] = ' '.join(subtitle.split())
|
data['subtitle'] = ' '.join(subtitle.split())
|
||||||
|
|
||||||
# TODO implement author
|
|
||||||
# regex_author = r'author\">([^<]*)</span'
|
|
||||||
# 'author': re.search(regex_author, content).group(1),
|
|
||||||
|
|
||||||
# Get publish date
|
# Get publish date
|
||||||
search_publish_date = re.search(regex_publish_date, content)
|
search_publish_date = re.search(regex_publish_date, content)
|
||||||
if search_publish_date:
|
if search_publish_date:
|
||||||
@ -92,23 +90,15 @@ class BedetequeScraper:
|
|||||||
# Get author and illustrator
|
# Get author and illustrator
|
||||||
author = re.search(regex_author, content)
|
author = re.search(regex_author, content)
|
||||||
if 'author' not in data:
|
if 'author' not in data:
|
||||||
data['author'] = list()
|
data['authors'] = list()
|
||||||
if author:
|
if author:
|
||||||
data['author'].append(author.group(1))
|
author_obj = Auteur.objects.get_or_create(
|
||||||
|
name=author.group(1))[0]
|
||||||
|
data['authors'].append(author_obj)
|
||||||
illustrator = re.search(regex_illustrator, content)
|
illustrator = re.search(regex_illustrator, content)
|
||||||
if illustrator:
|
if illustrator:
|
||||||
data['author'].append(illustrator.group(1))
|
author_obj = Auteur.objects.get_or_create(
|
||||||
|
name=illustrator.group(1))[0]
|
||||||
author_name = data['author'][0]
|
data['authors'].append(author_obj)
|
||||||
if ',' not in author_name and ' ' in author_name:
|
|
||||||
author_name = author_name.split(' ')[1]
|
|
||||||
side_identifier = "{:.3} {:.3}".format(author_name.upper(),
|
|
||||||
data['title'].upper(),)
|
|
||||||
if data['subtitle']:
|
|
||||||
start = data['subtitle'].split(' ')[0].replace('.', '')
|
|
||||||
print("start:", start)
|
|
||||||
if start.isnumeric():
|
|
||||||
side_identifier += " {:0>2}".format(start,)
|
|
||||||
data['side_identifier'] = side_identifier
|
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
Loading…
Reference in New Issue
Block a user