# -*- mode: python; coding: utf-8 -*- # SPDX-License-Identifier: GPL-3.0-or-later import re import requests from media.models import Auteur class BedetequeScraper: """ Tool to scrap bedeteque.com """ def __init__(self): self.session = requests.session() self.referer = "https://www.bedetheque.com/search/albums" self.csrf_token = self._get_csrf_token() def _get_csrf_token(self) -> str: """ Get CSRF token for session :return: CSRF token """ response = self.session.get(self.referer).content.decode() regex = r'csrf_token_bel\"\s*value=\"(\w*)\"' return re.search(regex, response).group(1) def search_by_isbn(self, isbn: str) -> [str]: """ Make the request and return :param isbn: ISBN we are searching :return: list of URLs matching ISBN """ scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \ "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \ "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \ "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \ "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \ "&RechDLFin=&RechCoteMin=&RechCoteMax=" \ "&RechEO=0".format(self.csrf_token, isbn) result = self.session.get(scrap_url, headers={'Referer': self.referer}) content = result.content.decode() regex = r'href=\"(https://www\.bedetheque\.com/BD.*.html)\"' return re.findall(regex, content) def scrap_bd_info(self, bd_url: str) -> dict: """ Load BD web page and scrap data :param bd_url: URL where to find BD data :return: BD data """ result = self.session.get(bd_url) content = result.content.decode().replace('\r', '') content = content.replace('\n', '') regex_title = r'