114 lines
4.2 KiB
Python
114 lines
4.2 KiB
Python
# -*- mode: python; coding: utf-8 -*-
|
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
import re
|
|
|
|
import requests
|
|
|
|
|
|
class BedetequeScraper:
    """
    Tool to scrape bedetheque.com

    Opens an HTTP session on the album search page, reads the
    ``csrf_token_bedetheque`` value found in that page, and uses both to
    search albums by ISBN and to scrape the data of an album ("BD") page.
    """

    def __init__(self):
        self.session = requests.session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session
        :return: CSRF token
        :raises RuntimeError: if the search page does not contain the token
        """
        response = self.session.get(self.referer).content.decode()
        regex = r'csrf_token_bedetheque\"\s*value=\"(\w*)\"'
        match = re.search(regex, response)
        if match is None:
            # Fail with an explicit message instead of the opaque
            # "'NoneType' object has no attribute 'group'".
            raise RuntimeError(
                "CSRF token not found in {}".format(self.referer)
            )
        return match.group(1)

    def search_by_isbn(self, isbn: str) -> list:
        """
        Search albums by ISBN and return the album page URLs found
        :param isbn: ISBN we are searching
        :return: list of URLs (str) matching ISBN, empty if none is found
        """
        scrap_url = (
            "https://www.bedetheque.com/search/albums?RechIdSerie="
            "&RechIdAuteur=undefined&csrf_token_bedetheque={}"
            "&RechSerie=&RechTitre=&RechEditeur=&RechCollection="
            "&RechStyle=&RechAuteur=&RechISBN={}&RechParution="
            "&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb="
            "&RechDLFin=&RechCoteMin=&RechCoteMax="
            "&RechEO=0"
        ).format(self.csrf_token, isbn)
        result = self.session.get(scrap_url, headers={'Referer': self.referer})
        return self._extract_album_urls(result.content.decode())

    @staticmethod
    def _extract_album_urls(content: str) -> list:
        """
        Extract album page URLs from the HTML of a search result page
        :param content: HTML of the search result page
        :return: list of album URLs, in order of appearance
        """
        # [^"]* keeps each match inside its own href attribute (a greedy
        # ".*" merges several links that share a line), and the dot before
        # "html" is escaped so it only matches a literal dot.
        regex = r'href=\"(https://www\.bedetheque\.com/BD[^"]*\.html)\"'
        return re.findall(regex, content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrape data
        :param bd_url: URL where to find BD data
        :return: BD data. 'external_url' and 'author' (a list, possibly
                 empty) are always present; 'title', 'subtitle',
                 'publish_date', 'number_of_pages' and 'side_identifier'
                 are present only when they could be found or built
        """
        result = self.session.get(bd_url)
        # Flatten the page to one line so the patterns can span what were
        # originally several lines.
        content = result.content.decode().replace('\r', '').replace('\n', '')
        return self._parse_bd_page(content, bd_url)

    @classmethod
    def _parse_bd_page(cls, content: str, bd_url: str) -> dict:
        """
        Extract BD data from the (single-line) HTML of an album page
        :param content: HTML of the album page, without line breaks
        :param bd_url: URL of the page, stored as 'external_url'
        :return: BD data, see scrap_bd_info
        """
        # Non-greedy groups: on a single-line page a greedy ".*" would run
        # up to the LAST closing tag of the page instead of the first one.
        regex_title = r'<h1>\s*<a[^>]*>(.*?)</a>\s*</h1>'
        regex_subtitle = r'<h2>\s*(.*?)</h2>'
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
        regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
        regex_illustrator = r'<span itemprop=\"illustrator\">(((?!<).)*)</span>'

        data = {
            'external_url': bd_url,
        }

        # Get title
        search_title = re.search(regex_title, content)
        if search_title:
            data['title'] = search_title.group(1).strip()

        # For subtitle, take serie number into account
        search_subtitle = re.search(regex_subtitle, content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            # Collapse any run of whitespace into a single space
            data['subtitle'] = ' '.join(subtitle.split())

        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)

        # Get number of pages (kept as a string; "\d*" may match nothing)
        search_nb_pages = re.search(regex_nb_of_pages, content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = search_nb_pages.group(1)

        # Get author and illustrator, author first
        found = (
            re.search(regex_author, content),
            re.search(regex_illustrator, content),
        )
        data['author'] = [match.group(1) for match in found if match]

        # The side identifier needs an author and a title; leave the key
        # out when it cannot be built rather than failing the whole scrape.
        side_identifier = cls._build_side_identifier(data)
        if side_identifier:
            data['side_identifier'] = side_identifier

        return data

    @staticmethod
    def _build_side_identifier(data: dict) -> str:
        """
        Build the side identifier "AUT TIT[ NN]" from scraped BD data
        :param data: BD data with optional 'author', 'title', 'subtitle'
        :return: first three letters of the author name and of the title,
                 upper-cased, followed by the zero-padded number that
                 starts the subtitle when there is one; empty string when
                 author or title is missing
        """
        authors = data.get('author') or []
        if not authors or 'title' not in data:
            return ''

        author_name = authors[0]
        if ',' not in author_name:
            # No comma: use the second word of the name, or the only word
            # for single-word names.
            words = author_name.split()
            if not words:
                return ''
            author_name = words[1] if len(words) > 1 else words[0]

        side_identifier = "{:.3} {:.3}".format(
            author_name.upper(), data['title'].upper()
        )

        subtitle = data.get('subtitle')
        if subtitle:
            # A subtitle such as "1. Some title" starts with the number
            start = subtitle.split(' ')[0].replace('.', '')
            if start.isnumeric():
                side_identifier += " {:0>2}".format(start)

        return side_identifier
|