1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2025-01-01 00:22:22 +00:00
med/media/scraper.py

113 lines
4.1 KiB
Python
Raw Normal View History

2019-08-19 10:17:49 +00:00
# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later
import re
import requests
2020-02-10 10:29:26 +00:00
from media.models import Auteur
2019-08-19 10:17:49 +00:00
class BedetequeScraper:
    """
    Tool to scrape album data from bedetheque.com.

    The site protects its search form with a CSRF token, so an instance
    opens an HTTP session and fetches that token once at construction
    time; every later request reuses the same session.
    """

    # Seconds to wait for the remote site before giving up. Without a
    # timeout ``requests`` may block forever on an unresponsive server.
    # Override on a subclass or an instance if a different value is needed.
    REQUEST_TIMEOUT = 10

    # Patterns are compiled once at class creation instead of on each call.
    _CSRF_RE = re.compile(r'csrf_token_bedetheque"\s*value="(\w*)"')
    # ``[^"]*`` keeps each match inside a single href attribute; a greedy
    # ``.*`` would merge several links found on the same line into one.
    _ALBUM_URL_RE = re.compile(
        r'href="(https://www\.bedetheque\.com/BD[^"]*\.html)"')
    # Title and subtitle are searched in a page whose newlines were
    # removed, so the capture must be non-greedy to stop at the first
    # closing tag rather than at the last one of the whole document.
    _TITLE_RE = re.compile(r'<h1>\s*<a[^>]*>(.*?)</a>\s*</h1>')
    _SUBTITLE_RE = re.compile(r'<h2>\s*(.*?)</h2>')
    _PUBLISH_DATE_RE = re.compile(r'datePublished" content="([\d-]*)">')
    _NB_PAGES_RE = re.compile(r'numberOfPages">(\d*)</span')
    _FORMAT_RE = re.compile(r'<label>Format : </label>Format (\w+)</li>')
    _AUTHOR_RE = re.compile(r'<span itemprop="author">(((?!<).)*)</span>')
    _ILLUSTRATOR_RE = re.compile(
        r'span itemprop="illustrator">(((?!<).)*)</span')

    def __init__(self):
        self.session = requests.session()
        self.referer = "https://www.bedetheque.com/search/albums"
        self.csrf_token = self._get_csrf_token()

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session

        :return: CSRF token found in the search page
        :raises AttributeError: if the search page contains no token
        """
        response = self.session.get(
            self.referer, timeout=self.REQUEST_TIMEOUT).content.decode()
        match = self._CSRF_RE.search(response)
        if match is None:
            # Same exception type as the former ``None.group(1)`` failure,
            # so existing callers are unaffected, but with a usable message.
            raise AttributeError(
                "CSRF token not found in {}".format(self.referer))
        return match.group(1)

    def search_by_isbn(self, isbn: str) -> list:
        """
        Search albums by ISBN

        :param isbn: ISBN we are searching
        :return: list of album URLs (str) found in the result page,
                 in page order; empty when nothing matches
        """
        # Let requests build the query string so that the ISBN is always
        # URL-encoded. Empty values are kept as ``key=``, which is what
        # the search form submits.
        params = {
            'RechIdSerie': '',
            'RechIdAuteur': 'undefined',
            'csrf_token_bedetheque': self.csrf_token,
            'RechSerie': '',
            'RechTitre': '',
            'RechEditeur': '',
            'RechCollection': '',
            'RechStyle': '',
            'RechAuteur': '',
            'RechISBN': isbn,
            'RechParution': '',
            'RechOrigine': '',
            'RechLangue': '',
            'RechMotCle': '',
            'RechDLDeb': '',
            'RechDLFin': '',
            'RechCoteMin': '',
            'RechCoteMax': '',
            'RechEO': '0',
        }
        result = self.session.get(
            "https://www.bedetheque.com/search/albums",
            params=params,
            headers={'Referer': self.referer},
            timeout=self.REQUEST_TIMEOUT,
        )
        content = result.content.decode()
        return self._ALBUM_URL_RE.findall(content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrape data

        The returned dict always contains ``external_url``,
        ``number_of_pages`` (the digits found in the page as a string,
        or the integer ``0`` when none are found) and ``authors`` (a
        list of ``Auteur`` objects, possibly empty). ``title``,
        ``subtitle``, ``publish_date`` and ``format`` are only present
        when the matching markup is found in the page.

        :param bd_url: URL where to find BD data
        :return: BD data
        """
        result = self.session.get(bd_url, timeout=self.REQUEST_TIMEOUT)
        # Flatten the page to a single line so that patterns can match
        # markup that spans several source lines.
        content = result.content.decode().replace('\r', '')
        content = content.replace('\n', '')

        data = {
            'external_url': bd_url,
        }

        # Get title
        search_title = self._TITLE_RE.search(content)
        if search_title:
            data['title'] = search_title.group(1).strip()

        # For subtitle, take serie number into account: drop the empty
        # numbering placeholder and collapse runs of whitespace.
        search_subtitle = self._SUBTITLE_RE.search(content)
        if search_subtitle:
            subtitle = search_subtitle.group(1)
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(subtitle.split())

        # Get publish date
        search_publish_date = self._PUBLISH_DATE_RE.search(content)
        if search_publish_date:
            data['publish_date'] = search_publish_date.group(1)

        # Get number of pages, defaulting to 0 when missing or empty
        search_nb_pages = self._NB_PAGES_RE.search(content)
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
            data['number_of_pages'] = search_nb_pages.group(1)
        else:
            data['number_of_pages'] = 0

        # Get format of the book
        search_format = self._FORMAT_RE.search(content)
        if search_format:
            data['format'] = search_format.group(1).lower()

        # Get author and illustrator: the first match of each, author
        # first, created in the database when not already known.
        data['authors'] = []
        for person_re in (self._AUTHOR_RE, self._ILLUSTRATOR_RE):
            person = person_re.search(content)
            if person:
                author_obj = Auteur.objects.get_or_create(
                    name=person.group(1))[0]
                data['authors'].append(author_obj)

        return data