Add bedeteque scraper

This commit is contained in:
Alexandre Iooss 2019-08-19 12:17:49 +02:00
parent d6df704d09
commit a6b5d8b822
No known key found for this signature in database
GPG Key ID: 6C79278F3FCDCC02
5 changed files with 131 additions and 20 deletions

View File

@ -7,6 +7,8 @@ import urllib.request
from django.forms import ModelForm
from .scraper import BedetequeScraper
class MediaAdminForm(ModelForm):
def __init__(self, *args, **kwargs):
@ -16,9 +18,24 @@ class MediaAdminForm(ModelForm):
isbn_field.widget.template_name = "media/isbn_button.html"
isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
def download_data_bedeteque(self, isbn):
    """
    Try to fill the form with data scraped from bedeteque.

    :param isbn: ISBN to search for
    :return: True if data was found and merged into cleaned_data
    """
    scraper = BedetequeScraper()
    results = scraper.search_by_isbn(isbn)
    if results:
        # Results come back most-accurate first; use the top hit.
        self.cleaned_data.update(scraper.scrap_bd_info(results[0]))
        return True
    return False
def download_data_openlibrary(self, isbn):
"""
Download data from openlibrary
:return True if success
"""
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
"&format=json&jscmd=data".format(isbn)
@ -26,27 +43,31 @@ class MediaAdminForm(ModelForm):
data = json.loads(url.read().decode())
if data and data['ISBN:' + isbn]:
data = data['ISBN:' + isbn]
if 'url' in data:
# Fill the data
# TODO implement authors, side_identifier
self.cleaned_data['external_url'] = data['url']
if 'title' in data:
self.cleaned_data['title'] = data['title']
if 'subtitle' in data:
self.cleaned_data['subtitle'] = data['subtitle']
if 'url' in data:
self.cleaned_data['external_url'] = data['url']
if 'number_of_pages' in data:
self.cleaned_data['number_of_pages'] = \
data['number_of_pages']
return True
return False
def clean(self):
    """
    If the user asked to fetch ISBN data, download it before the form
    is validated, so the scraped fields take part in validation.
    """
    # TODO implement authors, side_identifier
    if "_continue" in self.request.POST:
        isbn = self.cleaned_data.get('isbn')
        if isbn:
            # ISBN is present: try bedeteque first, then fall back
            # to OpenLibrary when it yields nothing.
            if not self.download_data_bedeteque(isbn):
                self.download_data_openlibrary(isbn)
    return super().clean()

90
media/scraper.py Normal file
View File

@ -0,0 +1,90 @@
# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later
import re
import requests
class BedetequeScraper:
    """
    Tool to scrap bedetheque.com.

    Finds comic-book ("BD") album pages by ISBN and extracts their
    metadata. A single ``requests`` session is reused for every call,
    because the CSRF token fetched at construction time is only valid
    together with this session's cookies.
    """

    def __init__(self):
        # One session for the whole dialogue: token + cookies must match.
        self.session = requests.Session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Fetch the search page and extract the CSRF token for this session.

        :return: CSRF token value
        :raises ValueError: if no token can be found in the page
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"'
        match = re.search(regex, response)
        if match is None:
            # Fail loudly: without the token every search is rejected.
            raise ValueError(
                "No CSRF token found on {}".format(self.referer))
        return match.group(1)

    def search_by_isbn(self, isbn: str) -> list:
        """
        Search albums matching an ISBN.

        :param isbn: ISBN we are searching
        :return: list of album page URLs (str) matching the ISBN
        """
        scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \
                    "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \
                    "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \
                    "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \
                    "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \
                    "&RechDLFin=&RechCoteMin=&RechCoteMax=" \
                    "&RechEO=0".format(self.csrf_token, isbn)
        result = self.session.get(scrap_url, headers={'Referer': self.referer})
        content = result.content.decode()
        # Non-greedy match with an escaped dot: the previous greedy
        # pattern ("BD.*.html") swallowed everything up to the LAST
        # '…html"' and merged several result links into one match.
        regex = r'href=\"(https://www\.bedetheque\.com/BD.*?\.html)\"'
        return re.findall(regex, content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load a BD album page and scrap its data.

        :param bd_url: URL where to find BD data
        :return: dict with 'external_url' and, when found on the page,
                 'title', 'subtitle', 'publish_date' and
                 'number_of_pages' (as int)
        """
        result = self.session.get(bd_url)
        # Flatten to a single line so the regexes need not handle newlines.
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')
        regex_title = r'<h1>\s*<a[^>]*>(.*)</a>\s*</h1>'
        regex_subtitle = r'<h2>\s*(.*)</h2>'
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
        data = {
            'external_url': bd_url,
        }
        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()
        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            # Drop the decorative serie-number marker, then collapse
            # whitespace runs to single spaces.
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(subtitle.split())
        # TODO implement author
        # regex_author = r'author\">([^<]*)</span'
        # 'author': re.search(regex_author, content).group(1),
        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)
        # Get number of pages — stored as int for consistency with the
        # OpenLibrary data source, which yields an int from JSON.
        search_nb_pages = re.search(regex_nb_of_pages, content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = int(search_nb_pages.group(1))
        return data

View File

@ -1,6 +1,6 @@
from rest_framework import serializers
from .models import Auteur, Media, Emprunt, Jeu
from .models import Auteur, Emprunt, Jeu, Media
class AuteurSerializer(serializers.HyperlinkedModelSerializer):

View File

@ -11,9 +11,9 @@ from django.utils.translation import gettext_lazy as _
from rest_framework import viewsets
from reversion import revisions as reversion
from .models import Auteur, Media, Emprunt, Jeu
from .serializers import AuteurSerializer, MediaSerializer, \
EmpruntSerializer, JeuSerializer
from .models import Auteur, Emprunt, Jeu, Media
from .serializers import AuteurSerializer, EmpruntSerializer, \
JeuSerializer, MediaSerializer
@login_required

View File

@ -4,7 +4,7 @@
from django.contrib import messages
from django.contrib.auth.decorators import login_required, permission_required
from django.contrib.auth.models import User, Group
from django.contrib.auth.models import Group
from django.db import transaction
from django.shortcuts import redirect, render
from django.template.context_processors import csrf
@ -14,7 +14,7 @@ from reversion import revisions as reversion
from users.forms import BaseInfoForm
from users.models import Adhesion, User
from .serializers import UserSerializer, GroupSerializer
from .serializers import GroupSerializer, UserSerializer
def form(ctx, template, request):