Add bedeteque scraper

This commit is contained in:
Alexandre Iooss 2019-08-19 12:17:49 +02:00
parent d6df704d09
commit a6b5d8b822
No known key found for this signature in database
GPG Key ID: 6C79278F3FCDCC02
5 changed files with 131 additions and 20 deletions

View File

@ -7,6 +7,8 @@ import urllib.request
from django.forms import ModelForm
from .scraper import BedetequeScraper
class MediaAdminForm(ModelForm):
def __init__(self, *args, **kwargs):
@ -16,9 +18,24 @@ class MediaAdminForm(ModelForm):
isbn_field.widget.template_name = "media/isbn_button.html"
isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
def download_data_bedeteque(self, isbn):
    """
    Try to fill the form with data scraped from bedeteque.

    :param isbn: ISBN to search for
    :return: True if data was found and merged into cleaned_data
    """
    scraper = BedetequeScraper()
    results = scraper.search_by_isbn(isbn)
    if results:
        # Results come back most-accurate first; use the top hit.
        self.cleaned_data.update(scraper.scrap_bd_info(results[0]))
        return True
    return False
def download_data_openlibrary(self, isbn):
"""
Download data from openlibrary
:return True if success
"""
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
"&format=json&jscmd=data".format(isbn)
@ -26,27 +43,31 @@ class MediaAdminForm(ModelForm):
data = json.loads(url.read().decode())
if data and data['ISBN:' + isbn]:
data = data['ISBN:' + isbn]
if 'url' in data:
# Fill the data
# TODO implement authors, side_identifier
self.cleaned_data['external_url'] = data['url']
if 'title' in data:
self.cleaned_data['title'] = data['title']
if 'subtitle' in data:
self.cleaned_data['subtitle'] = data['subtitle']
if 'url' in data:
self.cleaned_data['external_url'] = data['url']
if 'number_of_pages' in data:
self.cleaned_data['number_of_pages'] = \
data['number_of_pages']
return True
return False
def clean(self):
    """
    If the user asked to fetch ISBN data, download it before the form
    is validated, so the scraped fields take part in validation.
    """
    # TODO implement authors, side_identifier
    if "_continue" in self.request.POST:
        isbn = self.cleaned_data.get('isbn')
        if isbn:
            # ISBN is present: try bedeteque first, then fall back
            # to OpenLibrary when it yields nothing.
            if not self.download_data_bedeteque(isbn):
                self.download_data_openlibrary(isbn)
    return super().clean()

90
media/scraper.py Normal file
View File

@ -0,0 +1,90 @@
# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later
import re
import requests
class BedetequeScraper:
    """
    Tool to scrap bedetheque.com.

    Finds comic-book ("BD") album pages by ISBN and extracts their
    metadata. A single ``requests`` session is reused for every call,
    because the CSRF token fetched at construction time is only valid
    together with this session's cookies.
    """

    def __init__(self):
        # One session for the whole dialogue: token + cookies must match.
        self.session = requests.Session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Fetch the search page and extract the CSRF token for this session.

        :return: CSRF token value
        :raises ValueError: if no token can be found in the page
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"'
        match = re.search(regex, response)
        if match is None:
            # Fail loudly: without the token every search is rejected.
            raise ValueError(
                "No CSRF token found on {}".format(self.referer))
        return match.group(1)

    def search_by_isbn(self, isbn: str) -> list:
        """
        Search albums matching an ISBN.

        :param isbn: ISBN we are searching
        :return: list of album page URLs (str) matching the ISBN
        """
        scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \
                    "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \
                    "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \
                    "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \
                    "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \
                    "&RechDLFin=&RechCoteMin=&RechCoteMax=" \
                    "&RechEO=0".format(self.csrf_token, isbn)
        result = self.session.get(scrap_url, headers={'Referer': self.referer})
        content = result.content.decode()
        # Non-greedy match with an escaped dot: the previous greedy
        # pattern ("BD.*.html") swallowed everything up to the LAST
        # '…html"' and merged several result links into one match.
        regex = r'href=\"(https://www\.bedetheque\.com/BD.*?\.html)\"'
        return re.findall(regex, content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load a BD album page and scrap its data.

        :param bd_url: URL where to find BD data
        :return: dict with 'external_url' and, when found on the page,
                 'title', 'subtitle', 'publish_date' and
                 'number_of_pages' (as int)
        """
        result = self.session.get(bd_url)
        # Flatten to a single line so the regexes need not handle newlines.
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')
        regex_title = r'<h1>\s*<a[^>]*>(.*)</a>\s*</h1>'
        regex_subtitle = r'<h2>\s*(.*)</h2>'
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
        data = {
            'external_url': bd_url,
        }
        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()
        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            # Drop the decorative serie-number marker, then collapse
            # whitespace runs to single spaces.
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(subtitle.split())
        # TODO implement author
        # regex_author = r'author\">([^<]*)</span'
        # 'author': re.search(regex_author, content).group(1),
        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)
        # Get number of pages — stored as int for consistency with the
        # OpenLibrary data source, which yields an int from JSON.
        search_nb_pages = re.search(regex_nb_of_pages, content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = int(search_nb_pages.group(1))
        return data

View File

@ -1,6 +1,6 @@
from rest_framework import serializers
from .models import Auteur, Media, Emprunt, Jeu
from .models import Auteur, Emprunt, Jeu, Media
class AuteurSerializer(serializers.HyperlinkedModelSerializer):

View File

@ -11,9 +11,9 @@ from django.utils.translation import gettext_lazy as _
from rest_framework import viewsets
from reversion import revisions as reversion
from .models import Auteur, Media, Emprunt, Jeu
from .serializers import AuteurSerializer, MediaSerializer, \
EmpruntSerializer, JeuSerializer
from .models import Auteur, Emprunt, Jeu, Media
from .serializers import AuteurSerializer, EmpruntSerializer, \
JeuSerializer, MediaSerializer
@login_required

View File

@ -4,7 +4,7 @@
from django.contrib import messages
from django.contrib.auth.decorators import login_required, permission_required
from django.contrib.auth.models import User, Group
from django.contrib.auth.models import Group
from django.db import transaction
from django.shortcuts import redirect, render
from django.template.context_processors import csrf
@ -14,7 +14,7 @@ from reversion import revisions as reversion
from users.forms import BaseInfoForm
from users.models import Adhesion, User
from .serializers import UserSerializer, GroupSerializer
from .serializers import GroupSerializer, UserSerializer
def form(ctx, template, request):