diff --git a/media/forms.py b/media/forms.py index 8d928dc..24b5cbd 100644 --- a/media/forms.py +++ b/media/forms.py @@ -7,6 +7,8 @@ import urllib.request from django.forms import ModelForm +from .scraper import BedetequeScraper + class MediaAdminForm(ModelForm): def __init__(self, *args, **kwargs): @@ -16,9 +18,24 @@ class MediaAdminForm(ModelForm): isbn_field.widget.template_name = "media/isbn_button.html" isbn_field.widget.attrs.update({'autofocus': 'autofocus'}) - def download_data(self, isbn): + def download_data_bedeteque(self, isbn): """ - Download data from ISBN + Download data from bedeteque + :return True if success + """ + scraper = BedetequeScraper() + r = scraper.search_by_isbn(isbn) + if not r: + return False + # If results, then take the most accurate + data = scraper.scrap_bd_info(r[0]) + self.cleaned_data.update(data) + return True + + def download_data_openlibrary(self, isbn): + """ + Download data from openlibrary + :return True if success """ api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \ "&format=json&jscmd=data".format(isbn) @@ -26,27 +43,31 @@ class MediaAdminForm(ModelForm): data = json.loads(url.read().decode()) if data and data['ISBN:' + isbn]: data = data['ISBN:' + isbn] - - # Fill the data - # TODO implement authors, side_identifier - if 'title' in data: - self.cleaned_data['title'] = data['title'] - if 'subtitle' in data: - self.cleaned_data['subtitle'] = data['subtitle'] if 'url' in data: + # Fill the data self.cleaned_data['external_url'] = data['url'] - if 'number_of_pages' in data: - self.cleaned_data['number_of_pages'] = \ - data['number_of_pages'] + if 'title' in data: + self.cleaned_data['title'] = data['title'] + if 'subtitle' in data: + self.cleaned_data['subtitle'] = data['subtitle'] + if 'number_of_pages' in data: + self.cleaned_data['number_of_pages'] = \ + data['number_of_pages'] + return True + return False def clean(self): """ If user fetch ISBN data, then download data before validating the form """ + # TODO implement authors, side_identifier if "_continue" in self.request.POST: isbn = self.cleaned_data.get('isbn') if isbn: - # ISBN is present - self.download_data(isbn) + # ISBN is present, try with bedeteque + scrap_result = self.download_data_bedeteque(isbn) + if not scrap_result: + # Try with OpenLibrary + self.download_data_openlibrary(isbn) return super().clean() diff --git a/media/scraper.py b/media/scraper.py new file mode 100644 index 0000000..ec91d5e --- /dev/null +++ b/media/scraper.py @@ -0,0 +1,90 @@ +# -*- mode: python; coding: utf-8 -*- +# SPDX-License-Identifier: GPL-3.0-or-later + +import re + +import requests + + +class BedetequeScraper: + """ + Tool to scrap bedeteque.com + """ + + def __init__(self): + self.session = requests.session() + self.referer = "https://www.bedetheque.com/search/albums" + self.csrf_token = self._get_csrf_token() + + def _get_csrf_token(self) -> str: + """ + Get CSRF token for session + :return: CSRF token + """ + response = self.session.get(self.referer).content.decode() + regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"' + return re.search(regex, response).group(1) + + def search_by_isbn(self, isbn: str) -> [str]: + """ + Make the request and return + :param isbn: ISBN we are searching + :return: list of URLs matching ISBN + """ + scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \ + "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \ + "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \ + "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \ + "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \ + "&RechDLFin=&RechCoteMin=&RechCoteMax=" \ + "&RechEO=0".format(self.csrf_token, isbn) + result = self.session.get(scrap_url, headers={'Referer': self.referer}) + content = result.content.decode() + regex = r'href=\"(https://www\.bedetheque\.com/BD.*.html)\"' + return re.findall(regex, content) + + def scrap_bd_info(self, bd_url: str) -> dict: + """ + Load BD web page and scrap data + :param bd_url: URL where to find BD data + :return: BD data + """ + result = self.session.get(bd_url) + content = result.content.decode().replace('\r', '') + content = content.replace('\n', '') + regex_title = r'

\s*]*>(.*)\s*

' + regex_subtitle = r'

\s*(.*)

' + regex_publish_date = r'datePublished\" content=\"([\d-]*)\">' + regex_nb_of_pages = r'numberOfPages\">(\d*)', '') + data['subtitle'] = ' '.join(subtitle.split()) + + # TODO implement author + # regex_author = r'author\">([^<]*)