# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later
# NOTE(review): this file was recovered from a whitespace-mangled extraction;
# the line structure has been restored.  Several regex literals in
# scrap_bd_info had their HTML-tag portions stripped by the extractor —
# see the notes in that method.

import re
from typing import List

import requests


class BedetequeScraper:
    """
    Tool to scrape album data from bedetheque.com.
    """

    def __init__(self) -> None:
        # A single Session is reused so the cookies set while fetching the
        # CSRF token are sent back with every later request.
        self.session = requests.Session()  # Session(): lowercase session() is a deprecated alias
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Get the CSRF token for this session.

        :return: CSRF token value
        :raises ValueError: if no token can be found on the search page
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"'
        match = re.search(regex, response)
        if match is None:
            # Raise a descriptive error instead of the opaque
            # AttributeError that .group(1) on None would produce.
            raise ValueError("CSRF token not found on %s" % self.referer)
        return match.group(1)

    def search_by_isbn(self, isbn: str) -> List[str]:
        """
        Run an album search by ISBN.

        :param isbn: ISBN we are searching
        :return: list of album URLs matching the ISBN
        """
        scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \
                    "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \
                    "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \
                    "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \
                    "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \
                    "&RechDLFin=&RechCoteMin=&RechCoteMax=" \
                    "&RechEO=0".format(self.csrf_token, isbn)
        result = self.session.get(scrap_url,
                                  headers={'Referer': self.referer})
        content = result.content.decode()
        # \.html: escape the dot so a literal ".html" suffix is required.
        regex = r'href=\"(https://www\.bedetheque\.com/BD.*\.html)\"'
        return re.findall(regex, content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load a BD (album) web page and scrape its data.

        :param bd_url: URL where to find BD data
        :return: BD data
        """
        result = self.session.get(bd_url)
        # Flatten the page onto one line so the regexes can span what were
        # originally multi-line tag sequences.
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')
        # NOTE(review): everything shaped like an HTML tag (<...>) inside the
        # raw strings below was stripped by the extractor that produced this
        # copy of the file.  The surviving fragments are kept verbatim —
        # restore the full patterns from the original source before use.
        regex_title = r'\s*]*>(.*)\s*'  # TODO: tags lost — confirm upstream
        regex_subtitle = r'\s*(.*)'  # TODO: tags lost — confirm upstream
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)(((?!<).)*)'  # TODO: confirm upstream
        regex_illustrator = r'(((?!<).)*)'  # TODO: leading tag lost — confirm upstream
        data = {
            'external_url': bd_url,
        }
        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()
        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            # NOTE(review): the first argument of this replace() was an HTML
            # tag that the extractor stripped; as written it is a no-op.
            subtitle = subtitle.replace('', '')
            data['subtitle'] = ' '.join(subtitle.split())
        # TODO implement author
        # regex_author = r'author\">([^<]*)