
Add bedeteque scraper

Alexandre Iooss 2019-08-19 12:17:49 +02:00
parent d6df704d09
commit a6b5d8b822
5 changed files with 131 additions and 20 deletions


@@ -7,6 +7,8 @@ import urllib.request
 from django.forms import ModelForm
+from .scraper import BedetequeScraper
 
 class MediaAdminForm(ModelForm):
     def __init__(self, *args, **kwargs):
@@ -16,9 +18,24 @@ class MediaAdminForm(ModelForm):
             isbn_field.widget.template_name = "media/isbn_button.html"
             isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
 
-    def download_data(self, isbn):
+    def download_data_bedeteque(self, isbn):
         """
-        Download data from ISBN
+        Download data from bedeteque
+        :return True if success
+        """
+        scraper = BedetequeScraper()
+        r = scraper.search_by_isbn(isbn)
+        if not r:
+            return False
+        # If results, then take the most accurate
+        data = scraper.scrap_bd_info(r[0])
+        self.cleaned_data.update(data)
+        return True
+
+    def download_data_openlibrary(self, isbn):
+        """
+        Download data from openlibrary
+        :return True if success
         """
         api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
                   "&format=json&jscmd=data".format(isbn)
@@ -26,27 +43,31 @@ class MediaAdminForm(ModelForm):
             data = json.loads(url.read().decode())
 
         if data and data['ISBN:' + isbn]:
             data = data['ISBN:' + isbn]
+            if 'url' in data:
             # Fill the data
-            # TODO implement authors, side_identifier
+                self.cleaned_data['external_url'] = data['url']
             if 'title' in data:
                 self.cleaned_data['title'] = data['title']
             if 'subtitle' in data:
                 self.cleaned_data['subtitle'] = data['subtitle']
-            if 'url' in data:
-                self.cleaned_data['external_url'] = data['url']
             if 'number_of_pages' in data:
                 self.cleaned_data['number_of_pages'] = \
                     data['number_of_pages']
+            return True
+        return False
 
     def clean(self):
         """
         If user fetch ISBN data, then download data before validating the form
         """
+        # TODO implement authors, side_identifier
         if "_continue" in self.request.POST:
             isbn = self.cleaned_data.get('isbn')
             if isbn:
-                # ISBN is present
-                self.download_data(isbn)
+                # ISBN is present, try with bedeteque
+                scrap_result = self.download_data_bedeteque(isbn)
+                if not scrap_result:
+                    # Try with OpenLibrary
+                    self.download_data_openlibrary(isbn)
         return super().clean()
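For context, a minimal standalone sketch of the OpenLibrary lookup that download_data_openlibrary relies on; the ISBN value below is only an illustration and is not taken from the commit.

import json
import urllib.request

isbn = "9782203314253"  # placeholder ISBN, purely illustrative
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
          "&format=json&jscmd=data".format(isbn)
with urllib.request.urlopen(api_url) as url:
    data = json.loads(url.read().decode())

# The response is a dict keyed by "ISBN:<isbn>"; the admin form reads
# its 'url', 'title', 'subtitle' and 'number_of_pages' entries.
record = data.get('ISBN:' + isbn, {})
print(record.get('title'), record.get('number_of_pages'))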

media/scraper.py Normal file

@@ -0,0 +1,90 @@
# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later

import re

import requests


class BedetequeScraper:
    """
    Tool to scrap bedeteque.com
    """

    def __init__(self):
        self.session = requests.session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session
        :return: CSRF token
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"'
        return re.search(regex, response).group(1)

    def search_by_isbn(self, isbn: str) -> [str]:
        """
        Make the request and return
        :param isbn: ISBN we are searching
        :return: list of URLs matching ISBN
        """
        scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \
                    "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \
                    "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \
                    "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \
                    "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \
                    "&RechDLFin=&RechCoteMin=&RechCoteMax=" \
                    "&RechEO=0".format(self.csrf_token, isbn)
        result = self.session.get(scrap_url, headers={'Referer': self.referer})
        content = result.content.decode()
        regex = r'href=\"(https://www\.bedetheque\.com/BD.*.html)\"'
        return re.findall(regex, content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrap data
        :param bd_url: URL where to find BD data
        :return: BD data
        """
        result = self.session.get(bd_url)
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')
        regex_title = r'<h1>\s*<a[^>]*>(.*)</a>\s*</h1>'
        regex_subtitle = r'<h2>\s*(.*)</h2>'
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
        data = {
            'external_url': bd_url,
        }

        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()

        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(subtitle.split())

        # TODO implement author
        # regex_author = r'author\">([^<]*)</span'
        # 'author': re.search(regex_author, content).group(1),

        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)

        # Get number of pages
        search_nb_pages = re.search(regex_nb_of_pages, content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = search_nb_pages.group(1)

        return data
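A rough usage sketch for the scraper above, assuming the Django app makes it importable as media.scraper; the ISBN is a placeholder.

from media.scraper import BedetequeScraper

scraper = BedetequeScraper()
# search_by_isbn() returns the album URLs matching the ISBN; the admin
# form treats the first result as the most accurate one.
urls = scraper.search_by_isbn("9782800312345")  # placeholder ISBN
if urls:
    info = scraper.scrap_bd_info(urls[0])
    print(info.get('title'), info.get('subtitle'), info.get('publish_date'))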


@@ -1,6 +1,6 @@
 from rest_framework import serializers
 
-from .models import Auteur, Media, Emprunt, Jeu
+from .models import Auteur, Emprunt, Jeu, Media
 
 
 class AuteurSerializer(serializers.HyperlinkedModelSerializer):


@@ -11,9 +11,9 @@ from django.utils.translation import gettext_lazy as _
 from rest_framework import viewsets
 from reversion import revisions as reversion
 
-from .models import Auteur, Media, Emprunt, Jeu
-from .serializers import AuteurSerializer, MediaSerializer, \
-    EmpruntSerializer, JeuSerializer
+from .models import Auteur, Emprunt, Jeu, Media
+from .serializers import AuteurSerializer, EmpruntSerializer, \
+    JeuSerializer, MediaSerializer
 
 
 @login_required


@@ -4,7 +4,7 @@
 from django.contrib import messages
 from django.contrib.auth.decorators import login_required, permission_required
-from django.contrib.auth.models import User, Group
+from django.contrib.auth.models import Group
 from django.db import transaction
 from django.shortcuts import redirect, render
 from django.template.context_processors import csrf
@@ -14,7 +14,7 @@ from reversion import revisions as reversion
 from users.forms import BaseInfoForm
 from users.models import Adhesion, User
 
-from .serializers import UserSerializer, GroupSerializer
+from .serializers import GroupSerializer, UserSerializer
 
 
 def form(ctx, template, request):