mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2025-01-11 23:02:26 +00:00
Add bedeteque scraper
This commit is contained in:
parent
d6df704d09
commit
a6b5d8b822
@ -7,6 +7,8 @@ import urllib.request
|
||||
|
||||
from django.forms import ModelForm
|
||||
|
||||
from .scraper import BedetequeScraper
|
||||
|
||||
|
||||
class MediaAdminForm(ModelForm):
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -16,9 +18,24 @@ class MediaAdminForm(ModelForm):
|
||||
isbn_field.widget.template_name = "media/isbn_button.html"
|
||||
isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
|
||||
|
||||
def download_data(self, isbn):
|
||||
def download_data_bedeteque(self, isbn):
    """
    Fill the form data with what bedeteque knows about this ISBN

    :param isbn: ISBN to look up
    :return: True if an album was found and its data merged into
        ``cleaned_data``, False if the search returned nothing
    """
    bedeteque = BedetequeScraper()
    matching_urls = bedeteque.search_by_isbn(isbn)
    if matching_urls:
        # The first search result is considered the most accurate one
        best_match = matching_urls[0]
        self.cleaned_data.update(bedeteque.scrap_bd_info(best_match))
        return True
    return False
|
||||
|
||||
def download_data_openlibrary(self, isbn):
|
||||
"""
|
||||
Download data from openlibrary
|
||||
:return True if success
|
||||
"""
|
||||
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
|
||||
"&format=json&jscmd=data".format(isbn)
|
||||
@ -26,27 +43,31 @@ class MediaAdminForm(ModelForm):
|
||||
data = json.loads(url.read().decode())
|
||||
if data and data['ISBN:' + isbn]:
|
||||
data = data['ISBN:' + isbn]
|
||||
|
||||
# Fill the data
|
||||
# TODO implement authors, side_identifier
|
||||
if 'title' in data:
|
||||
self.cleaned_data['title'] = data['title']
|
||||
if 'subtitle' in data:
|
||||
self.cleaned_data['subtitle'] = data['subtitle']
|
||||
if 'url' in data:
|
||||
# Fill the data
|
||||
self.cleaned_data['external_url'] = data['url']
|
||||
if 'number_of_pages' in data:
|
||||
self.cleaned_data['number_of_pages'] = \
|
||||
data['number_of_pages']
|
||||
if 'title' in data:
|
||||
self.cleaned_data['title'] = data['title']
|
||||
if 'subtitle' in data:
|
||||
self.cleaned_data['subtitle'] = data['subtitle']
|
||||
if 'number_of_pages' in data:
|
||||
self.cleaned_data['number_of_pages'] = \
|
||||
data['number_of_pages']
|
||||
return True
|
||||
return False
|
||||
|
||||
def clean(self):
    """
    Fetch metadata for the submitted ISBN before validating the form

    The lookup only runs when the POST data contains "_continue" and an
    ISBN is present in the cleaned data. Bedeteque is asked first and
    OpenLibrary only when bedeteque reports no result.
    """
    # TODO implement authors, side_identifier
    fetch_requested = "_continue" in self.request.POST
    isbn = self.cleaned_data.get('isbn') if fetch_requested else None
    if isbn and not self.download_data_bedeteque(isbn):
        # Nothing found on bedeteque, fall back to OpenLibrary
        self.download_data_openlibrary(isbn)

    return super().clean()
|
||||
|
90
media/scraper.py
Normal file
90
media/scraper.py
Normal file
@ -0,0 +1,90 @@
|
||||
# -*- mode: python; coding: utf-8 -*-
|
||||
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
import re
from urllib.parse import urlencode

import requests
|
||||
|
||||
|
||||
class BedetequeScraper:
    """
    Tool to scrap bedetheque.com

    Creating an instance opens an HTTP session and immediately fetches the
    album search page to read the CSRF token required by the search form.
    All HTML/URL handling lives in static helpers that do no network I/O.
    """

    _SEARCH_URL = "https://www.bedetheque.com/search/albums"

    _RE_CSRF_TOKEN = re.compile(r'csrf_token_bedetheque"\s*value="(\w*)"')
    # [^"]* keeps each match inside a single href attribute; a greedy ".*"
    # would merge several links found on the same line into one match.
    _RE_ALBUM_URL = re.compile(
        r'href="(https://www\.bedetheque\.com/BD[^"]*\.html)"'
    )
    # The page is collapsed to one line before matching, so the captures
    # must be non-greedy to stop at the first closing tag.
    _RE_TITLE = re.compile(r'<h1>\s*<a[^>]*>(.*?)</a>\s*</h1>')
    _RE_SUBTITLE = re.compile(r'<h2>\s*(.*?)</h2>')
    _RE_PUBLISH_DATE = re.compile(r'datePublished" content="([\d-]+)">')
    _RE_NB_OF_PAGES = re.compile(r'numberOfPages">(\d+)</span')

    def __init__(self, timeout: float = 10):
        """
        Open a session and fetch the CSRF token

        :param timeout: seconds to wait for each HTTP request before
            giving up, so an unresponsive site cannot block the caller
            forever
        """
        self.session = requests.session()
        self.referer = self._SEARCH_URL
        self.timeout = timeout
        self.csrf_token = self._get_csrf_token()

    @staticmethod
    def _extract_csrf_token(html: str) -> str:
        """
        Find the CSRF token in the search page

        :param html: content of the search page
        :return: CSRF token
        :raises ValueError: if the page does not contain the token
        """
        match = BedetequeScraper._RE_CSRF_TOKEN.search(html)
        if match is None:
            raise ValueError(
                "CSRF token not found in bedetheque search page")
        return match.group(1)

    @staticmethod
    def _build_search_url(csrf_token: str, isbn: str) -> str:
        """
        Build the album search URL for an ISBN

        Values are URL-encoded so that an ISBN containing spaces or
        characters such as ``&`` cannot alter the query string.

        :param csrf_token: CSRF token of the current session
        :param isbn: ISBN we are searching
        :return: full search URL
        """
        params = [
            ('RechIdSerie', ''),
            ('RechIdAuteur', 'undefined'),
            ('csrf_token_bedetheque', csrf_token),
            ('RechSerie', ''),
            ('RechTitre', ''),
            ('RechEditeur', ''),
            ('RechCollection', ''),
            ('RechStyle', ''),
            ('RechAuteur', ''),
            ('RechISBN', isbn),
            ('RechParution', ''),
            ('RechOrigine', ''),
            ('RechLangue', ''),
            ('RechMotCle', ''),
            ('RechDLDeb', ''),
            ('RechDLFin', ''),
            ('RechCoteMin', ''),
            ('RechCoteMax', ''),
            ('RechEO', '0'),
        ]
        return BedetequeScraper._SEARCH_URL + "?" + urlencode(params)

    @staticmethod
    def _extract_album_urls(html: str) -> list:
        """
        List the album page URLs found in a search result page

        :param html: content of the search result page
        :return: list of album URLs, in page order
        """
        return BedetequeScraper._RE_ALBUM_URL.findall(html)

    @staticmethod
    def _parse_bd_info(html: str, bd_url: str) -> dict:
        """
        Extract BD data from the content of an album page

        :param html: content of the album page
        :param bd_url: URL of the album page, stored as ``external_url``
        :return: BD data; a key is only present when its value was found
        """
        # Collapse the page to one line so the patterns can span tags that
        # are split over several lines in the HTML
        content = html.replace('\r', '').replace('\n', '')

        data = {
            'external_url': bd_url,
        }

        # Get title
        search_title = BedetequeScraper._RE_TITLE.search(content)
        if search_title:
            data['title'] = search_title.group(1).strip()

        # For subtitle, take serie number into account
        search_subtitle = BedetequeScraper._RE_SUBTITLE.search(content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            # Normalise runs of whitespace to single spaces
            data['subtitle'] = ' '.join(subtitle.split())

        # TODO implement author
        # regex_author = r'author\">([^<]*)</span'
        # 'author': re.search(regex_author, content).group(1),

        # Get publish date (skipped when the attribute is empty)
        search_publish_date = BedetequeScraper._RE_PUBLISH_DATE.search(
            content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)

        # Get number of pages (kept as the digit string found in the page)
        search_nb_pages = BedetequeScraper._RE_NB_OF_PAGES.search(content)
        if search_nb_pages:
            data['number_of_pages'] = search_nb_pages.group(1)

        return data

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session

        :return: CSRF token
        :raises ValueError: if the search page does not contain the token
        """
        response = self.session.get(self.referer, timeout=self.timeout)
        return self._extract_csrf_token(response.content.decode())

    def search_by_isbn(self, isbn: str) -> list:
        """
        Search albums matching an ISBN

        :param isbn: ISBN we are searching
        :return: list of URLs matching ISBN
        """
        scrap_url = self._build_search_url(self.csrf_token, isbn)
        result = self.session.get(
            scrap_url,
            headers={'Referer': self.referer},
            timeout=self.timeout,
        )
        return self._extract_album_urls(result.content.decode())

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrap data

        :param bd_url: URL where to find BD data
        :return: BD data
        """
        result = self.session.get(bd_url, timeout=self.timeout)
        return self._parse_bd_info(result.content.decode(), bd_url)
|
@ -1,6 +1,6 @@
|
||||
from rest_framework import serializers
|
||||
|
||||
from .models import Auteur, Media, Emprunt, Jeu
|
||||
from .models import Auteur, Emprunt, Jeu, Media
|
||||
|
||||
|
||||
class AuteurSerializer(serializers.HyperlinkedModelSerializer):
|
||||
|
@ -11,9 +11,9 @@ from django.utils.translation import gettext_lazy as _
|
||||
from rest_framework import viewsets
|
||||
from reversion import revisions as reversion
|
||||
|
||||
from .models import Auteur, Media, Emprunt, Jeu
|
||||
from .serializers import AuteurSerializer, MediaSerializer, \
|
||||
EmpruntSerializer, JeuSerializer
|
||||
from .models import Auteur, Emprunt, Jeu, Media
|
||||
from .serializers import AuteurSerializer, EmpruntSerializer, \
|
||||
JeuSerializer, MediaSerializer
|
||||
|
||||
|
||||
@login_required
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
from django.contrib import messages
|
||||
from django.contrib.auth.decorators import login_required, permission_required
|
||||
from django.contrib.auth.models import User, Group
|
||||
from django.contrib.auth.models import Group
|
||||
from django.db import transaction
|
||||
from django.shortcuts import redirect, render
|
||||
from django.template.context_processors import csrf
|
||||
@ -14,7 +14,7 @@ from reversion import revisions as reversion
|
||||
|
||||
from users.forms import BaseInfoForm
|
||||
from users.models import Adhesion, User
|
||||
from .serializers import UserSerializer, GroupSerializer
|
||||
from .serializers import GroupSerializer, UserSerializer
|
||||
|
||||
|
||||
def form(ctx, template, request):
|
||||
|
Loading…
Reference in New Issue
Block a user