# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later

import re
from typing import List
from urllib.parse import quote

import requests

from media.models import Author


class BedetequeScraper:
    """
    Tool to scrap bedetheque.com

    A single HTTP session is kept for the lifetime of the object, and the
    CSRF token found on the search page is fetched once at construction
    and reused for every search.
    """

    def __init__(self):
        self.session = requests.session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session

        :return: CSRF token
        :raises AttributeError: if no token is found in the search page
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bel\"\s*value=\"(\w*)\"'
        # re.search returns None when the page does not contain the token,
        # in which case .group() raises AttributeError.
        return re.search(regex, response).group(1)

    def search_by_isbn(self, isbn: str) -> List[str]:
        """
        Search albums by ISBN and return the album pages found

        :param isbn: ISBN we are searching
        :return: list of URLs matching ISBN
        """
        # Percent-encode the ISBN so that characters such as '&' or '#'
        # cannot alter the other query parameters.
        scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \
                    "&RechIdAuteur=undefined&csrf_token_bedetheque={}" \
                    "&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \
                    "&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \
                    "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \
                    "&RechDLFin=&RechCoteMin=&RechCoteMax=" \
                    "&RechEO=0".format(self.csrf_token,
                                       quote(str(isbn), safe=''))
        result = self.session.get(scrap_url, headers={'Referer': self.referer})
        content = result.content.decode()
        # [^"]* keeps each match inside one href attribute; a greedy '.*'
        # would merge several links found on the same line of markup.
        regex = r'href=\"(https://www\.bedetheque\.com/BD[^\"]*\.html)\"'
        return re.findall(regex, content)

    def scrap_comic_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrap data

        Keys 'external_url', 'number_of_pages' and 'authors' are always
        set; 'title', 'subtitle', 'publish_date' and 'format' are set only
        when the matching markup is found.

        :param bd_url: URL where to find BD data
        :return: BD data
        """
        result = self.session.get(bd_url)
        # Drop line breaks so that every pattern can match across what
        # were separate lines of markup.
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')

        # NOTE(review): the '.*' groups below are greedy and the page is
        # now a single line, so a page with several <h1>/<h2> elements
        # would yield an over-long capture - confirm against real pages.
        regex_title = r'<h1>\s*<a[^>]*>(.*)</a>\s*</h1>'
        regex_subtitle = r'<h2>\s*(.*)</h2>'
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
        regex_format = r'<label>Format : </label>Format (\w+)</li>'
        regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
        regex_illustrator = r'span itemprop=\"illustrator\">(((?!<).)*)</span'

        # NOTE(review): the statements from here down to the subtitle
        # clean-up were missing from the damaged file and have been
        # reconstructed - confirm against version control.
        data = {
            'external_url': bd_url,
        }

        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()

        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            # Remove the empty numbering placeholder, then collapse runs
            # of whitespace into single spaces.
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(subtitle.split())

        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)

        # Get number of pages; the digits are kept as text when found,
        # and 0 is used when they are missing or not numeric.
        search_nb_pages = re.search(regex_nb_of_pages, content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = search_nb_pages.group(1)
        else:
            data['number_of_pages'] = 0

        # Get format of the book
        search_format = re.search(regex_format, content)
        if search_format:
            data['format'] = search_format.group(1).lower()

        # Get author and illustrator, in that order; each name is mapped
        # to an Author row, created on first sight.
        data['authors'] = []
        for pattern in (regex_author, regex_illustrator):
            person = re.search(pattern, content)
            if person:
                author_obj = Author.objects.get_or_create(
                    name=person.group(1))[0]
                data['authors'].append(author_obj)

        return data