mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2024-11-26 20:47:10 +00:00
Add bedeteque scraper
This commit is contained in:
parent
d6df704d09
commit
a6b5d8b822
@ -7,6 +7,8 @@ import urllib.request
|
|||||||
|
|
||||||
from django.forms import ModelForm
|
from django.forms import ModelForm
|
||||||
|
|
||||||
|
from .scraper import BedetequeScraper
|
||||||
|
|
||||||
|
|
||||||
class MediaAdminForm(ModelForm):
|
class MediaAdminForm(ModelForm):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
@ -16,9 +18,24 @@ class MediaAdminForm(ModelForm):
|
|||||||
isbn_field.widget.template_name = "media/isbn_button.html"
|
isbn_field.widget.template_name = "media/isbn_button.html"
|
||||||
isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
|
isbn_field.widget.attrs.update({'autofocus': 'autofocus'})
|
||||||
|
|
||||||
def download_data_bedeteque(self, isbn):
    """
    Fetch album data for *isbn* from bedeteque and merge it into the
    form's cleaned data.

    :param isbn: ISBN to look up
    :return: True if success
    """
    scraper = BedetequeScraper()
    matches = scraper.search_by_isbn(isbn)
    if not matches:
        return False
    # Results come back ordered by relevance, so keep only the first hit.
    self.cleaned_data.update(scraper.scrap_bd_info(matches[0]))
    return True
|
||||||
|
|
||||||
|
def download_data_openlibrary(self, isbn):
|
||||||
|
"""
|
||||||
|
Download data from openlibrary
|
||||||
|
:return True if success
|
||||||
"""
|
"""
|
||||||
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
|
api_url = "https://openlibrary.org/api/books?bibkeys=ISBN:{}" \
|
||||||
"&format=json&jscmd=data".format(isbn)
|
"&format=json&jscmd=data".format(isbn)
|
||||||
@ -26,27 +43,31 @@ class MediaAdminForm(ModelForm):
|
|||||||
data = json.loads(url.read().decode())
|
data = json.loads(url.read().decode())
|
||||||
if data and data['ISBN:' + isbn]:
|
if data and data['ISBN:' + isbn]:
|
||||||
data = data['ISBN:' + isbn]
|
data = data['ISBN:' + isbn]
|
||||||
|
if 'url' in data:
|
||||||
# Fill the data
|
# Fill the data
|
||||||
# TODO implement authors, side_identifier
|
self.cleaned_data['external_url'] = data['url']
|
||||||
if 'title' in data:
|
if 'title' in data:
|
||||||
self.cleaned_data['title'] = data['title']
|
self.cleaned_data['title'] = data['title']
|
||||||
if 'subtitle' in data:
|
if 'subtitle' in data:
|
||||||
self.cleaned_data['subtitle'] = data['subtitle']
|
self.cleaned_data['subtitle'] = data['subtitle']
|
||||||
if 'url' in data:
|
|
||||||
self.cleaned_data['external_url'] = data['url']
|
|
||||||
if 'number_of_pages' in data:
|
if 'number_of_pages' in data:
|
||||||
self.cleaned_data['number_of_pages'] = \
|
self.cleaned_data['number_of_pages'] = \
|
||||||
data['number_of_pages']
|
data['number_of_pages']
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def clean(self):
    """
    If user fetch ISBN data, then download data before validating the form
    """
    # TODO implement authors, side_identifier
    if "_continue" in self.request.POST:
        isbn = self.cleaned_data.get('isbn')
        if not isbn:
            return super().clean()
        # ISBN is present, try with bedeteque first,
        # then fall back to OpenLibrary when it finds nothing.
        if not self.download_data_bedeteque(isbn):
            # Try with OpenLibrary
            self.download_data_openlibrary(isbn)
    return super().clean()
|
||||||
|
90
media/scraper.py
Normal file
90
media/scraper.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
# -*- mode: python; coding: utf-8 -*-
|
||||||
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class BedetequeScraper:
    """
    Tool to scrap bedeteque.com

    A single HTTP session is kept for the object's lifetime because the
    site ties the CSRF token (fetched at construction time) to the
    session cookies.
    """

    def __init__(self):
        self.session = requests.session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session

        :return: CSRF token
        :raises ValueError: if no token can be found in the search page
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"'
        match = re.search(regex, response)
        if match is None:
            # Fail with an explicit message instead of an opaque
            # AttributeError on None when the page layout changes.
            raise ValueError(
                "CSRF token not found on {}".format(self.referer))
        return match.group(1)

    def search_by_isbn(self, isbn: str) -> list:
        """
        Run the album search and collect matching album URLs.

        :param isbn: ISBN we are searching
        :return: list of URLs matching ISBN
        """
        scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \
                    "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \
                    "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \
                    "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \
                    "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \
                    "&RechDLFin=&RechCoteMin=&RechCoteMax=" \
                    "&RechEO=0".format(self.csrf_token, isbn)
        result = self.session.get(scrap_url, headers={'Referer': self.referer})
        content = result.content.decode()
        # Non-greedy match and escaped final dot: the greedy "BD.*.html"
        # could merge two hrefs appearing on the same line and would also
        # accept e.g. "...Xhtml".
        regex = r'href=\"(https://www\.bedetheque\.com/BD.*?\.html)\"'
        return re.findall(regex, content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrap data

        :param bd_url: URL where to find BD data
        :return: BD data dict; always contains 'external_url', plus
                 'title', 'subtitle', 'publish_date' and
                 'number_of_pages' when found in the page
        """
        result = self.session.get(bd_url)
        # Flatten the page onto one line so the regexes can span markup.
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')
        # NOTE(review): the greedy (.*) groups may over-capture if the
        # page ever contains several <h1>/<h2> blocks — confirm.
        regex_title = r'<h1>\s*<a[^>]*>(.*)</a>\s*</h1>'
        regex_subtitle = r'<h2>\s*(.*)</h2>'
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'

        data = {
            'external_url': bd_url,
        }

        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()

        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            # Collapse whitespace runs left by the removed markup.
            data['subtitle'] = ' '.join(subtitle.split())

        # TODO implement author
        # regex_author = r'author\">([^<]*)</span'
        # 'author': re.search(regex_author, content).group(1),

        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)

        # Get number of pages (kept as a string, as the original did)
        search_nb_pages = re.search(regex_nb_of_pages, content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = search_nb_pages.group(1)

        return data
|
@ -1,6 +1,6 @@
|
|||||||
from rest_framework import serializers
|
from rest_framework import serializers
|
||||||
|
|
||||||
from .models import Auteur, Media, Emprunt, Jeu
|
from .models import Auteur, Emprunt, Jeu, Media
|
||||||
|
|
||||||
|
|
||||||
class AuteurSerializer(serializers.HyperlinkedModelSerializer):
|
class AuteurSerializer(serializers.HyperlinkedModelSerializer):
|
||||||
|
@ -11,9 +11,9 @@ from django.utils.translation import gettext_lazy as _
|
|||||||
from rest_framework import viewsets
|
from rest_framework import viewsets
|
||||||
from reversion import revisions as reversion
|
from reversion import revisions as reversion
|
||||||
|
|
||||||
from .models import Auteur, Media, Emprunt, Jeu
|
from .models import Auteur, Emprunt, Jeu, Media
|
||||||
from .serializers import AuteurSerializer, MediaSerializer, \
|
from .serializers import AuteurSerializer, EmpruntSerializer, \
|
||||||
EmpruntSerializer, JeuSerializer
|
JeuSerializer, MediaSerializer
|
||||||
|
|
||||||
|
|
||||||
@login_required
|
@login_required
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
from django.contrib import messages
|
from django.contrib import messages
|
||||||
from django.contrib.auth.decorators import login_required, permission_required
|
from django.contrib.auth.decorators import login_required, permission_required
|
||||||
from django.contrib.auth.models import User, Group
|
from django.contrib.auth.models import Group
|
||||||
from django.db import transaction
|
from django.db import transaction
|
||||||
from django.shortcuts import redirect, render
|
from django.shortcuts import redirect, render
|
||||||
from django.template.context_processors import csrf
|
from django.template.context_processors import csrf
|
||||||
@ -14,7 +14,7 @@ from reversion import revisions as reversion
|
|||||||
|
|
||||||
from users.forms import BaseInfoForm
|
from users.forms import BaseInfoForm
|
||||||
from users.models import Adhesion, User
|
from users.models import Adhesion, User
|
||||||
from .serializers import UserSerializer, GroupSerializer
|
from .serializers import GroupSerializer, UserSerializer
|
||||||
|
|
||||||
|
|
||||||
def form(ctx, template, request):
|
def form(ctx, template, request):
|
||||||
|
Loading…
Reference in New Issue
Block a user