med/media/scraper.py

# -*- mode: python; coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-or-later

import re
from typing import List

import requests

from media.models import Auteur


class BedetequeScraper:
    """
    Tool to scrape album data from bedetheque.com

    The session that obtained the CSRF token is reused for every later
    request, and the token itself is sent along with each search.
    """

    SEARCH_URL = "https://www.bedetheque.com/search/albums"

    # Seconds to wait for the site before giving up; without a timeout
    # ``requests`` can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    # Patterns are compiled once here rather than rebuilt on every call.
    _CSRF_TOKEN_RE = re.compile(r'csrf_token_bedetheque"\s*value="(\w*)"')
    # ``[^"\s]*`` keeps a match inside a single href attribute, so two links
    # sitting on the same line are returned as two separate URLs.
    _ALBUM_URL_RE = re.compile(
        r'href="(https://www\.bedetheque\.com/BD[^"\s]*\.html)"')
    # Album pages are searched as one long line (newlines are removed), so
    # the title and subtitle captures are non-greedy: they stop at the first
    # closing tag instead of running up to the last one in the page.
    _TITLE_RE = re.compile(r'<h1>\s*<a[^>]*>(.*?)</a>\s*</h1>')
    _SUBTITLE_RE = re.compile(r'<h2>\s*(.*?)</h2>')
    _PUBLISH_DATE_RE = re.compile(r'datePublished" content="([\d-]*)">')
    _NB_OF_PAGES_RE = re.compile(r'numberOfPages">(\d*)</span')
    _FORMAT_RE = re.compile(r'<label>Format : </label>Format (\w+)</li>')
    _AUTHOR_RE = re.compile(r'<span itemprop="author">([^<]*)</span>')
    _ILLUSTRATOR_RE = re.compile(
        r'span itemprop="illustrator">([^<]*)</span')

    def __init__(self):
        self.session = requests.Session()
        self.referer = self.SEARCH_URL
        self.csrf_token = self._get_csrf_token()

    def _download(self, url: str, **kwargs) -> str:
        """
        Fetch a page through the shared session
        :param url: URL to request
        :param kwargs: extra arguments forwarded to the session ``get``
        :return: response body decoded as UTF-8
        """
        response = self.session.get(
            url, timeout=self.REQUEST_TIMEOUT, **kwargs)
        return response.content.decode()

    def _get_csrf_token(self) -> str:
        """
        Get CSRF token for session
        :return: CSRF token
        :raises AttributeError: when the search page holds no token field
        """
        page = self._download(self.referer)
        # A missing token leaves ``search`` returning None, hence the
        # AttributeError documented above (unchanged historical behaviour).
        return self._CSRF_TOKEN_RE.search(page).group(1)

    def search_by_isbn(self, isbn: str) -> List[str]:
        """
        Search albums by ISBN
        :param isbn: ISBN we are searching
        :return: list of album page URLs found in the search result, in
                 page order (empty when nothing matches)
        """
        # Passing the query as ``params`` lets requests escape the ISBN
        # instead of pasting it verbatim into the URL.
        query = {
            'RechIdSerie': '',
            'RechIdAuteur': 'undefined',
            'csrf_token_bedetheque': self.csrf_token,
            'RechSerie': '',
            'RechTitre': '',
            'RechEditeur': '',
            'RechCollection': '',
            'RechStyle': '',
            'RechAuteur': '',
            'RechISBN': isbn,
            'RechParution': '',
            'RechOrigine': '',
            'RechLangue': '',
            'RechMotCle': '',
            'RechDLDeb': '',
            'RechDLFin': '',
            'RechCoteMin': '',
            'RechCoteMax': '',
            'RechEO': '0',
        }
        content = self._download(
            self.SEARCH_URL,
            params=query,
            headers={'Referer': self.referer},
        )
        return self._ALBUM_URL_RE.findall(content)

    def scrap_bd_info(self, bd_url: str) -> dict:
        """
        Load BD web page and scrap data
        :param bd_url: URL where to find BD data
        :return: BD data. ``external_url``, ``number_of_pages`` and
                 ``authors`` are always present; ``title``, ``subtitle``,
                 ``publish_date`` and ``format`` only when found in the page
        """
        # Work on a single line so that patterns can span what used to be
        # several lines of markup.
        content = self._download(bd_url)
        content = content.replace('\r', '').replace('\n', '')

        data = {
            'external_url': bd_url,
        }

        # Get title
        title = self._TITLE_RE.search(content)
        if title:
            data['title'] = title.group(1).strip()

        # For subtitle, take serie number into account: drop the empty
        # marker span and collapse the whitespace left around it
        subtitle = self._SUBTITLE_RE.search(content)
        if subtitle:
            text = subtitle.group(1)
            text = text.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(text.split())

        # Get publish date
        publish_date = self._PUBLISH_DATE_RE.search(content)
        if publish_date:
            data['publish_date'] = publish_date.group(1)

        # Get number of pages: the digit string found in the page, or the
        # integer 0 when the page gives none
        nb_pages = self._NB_OF_PAGES_RE.search(content)
        if nb_pages and nb_pages.group(1).isnumeric():
            data['number_of_pages'] = nb_pages.group(1)
        else:
            data['number_of_pages'] = 0

        # Get format of the book
        book_format = self._FORMAT_RE.search(content)
        if book_format:
            data['format'] = book_format.group(1).lower()

        # Get author and illustrator. Blank names are skipped and a person
        # credited for both roles is only listed once.
        names = []
        for pattern in (self._AUTHOR_RE, self._ILLUSTRATOR_RE):
            credit = pattern.search(content)
            if not credit:
                continue
            name = credit.group(1)
            if name.strip() and name not in names:
                names.append(name)
        # get_or_create returns an (object, created) pair
        data['authors'] = [
            Auteur.objects.get_or_create(name=name)[0] for name in names
        ]

        return data
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`# -- mode: python; coding: utf-8 --`
			`# SPDX-License-Identifier: GPL-3.0-or-later`

			`import re`

			`import requests`

Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`from media.models import Auteur`

Add bedeteque scraper 2019-08-19 12:17:49 +02:00
			`class BedetequeScraper:`
			`"""`
			`Tool to scrap bedeteque.com`
			`"""`

			`def __init__(self):`
			`self.session = requests.session()`
			`self.referer = "https://www.bedetheque.com/search/albums"`
			`self.csrf_token = self._get_csrf_token()`

			`def _get_csrf_token(self) -> str:`
			`"""`
			`Get CSRF token for session`
			`:return: CSRF token`
			`"""`
			`response = self.session.get(self.referer).content.decode()`
			`regex = r'csrf_token_bedetheque\"\svalue=\"(\w)\"'`
			`return re.search(regex, response).group(1)`

			`def search_by_isbn(self, isbn: str) -> [str]:`
			`"""`
			`Make the request and return`
			`:param isbn: ISBN we are searching`
			`:return: list of URLs matching ISBN`
			`"""`
			`scrap_url = "https://www.bedetheque.com/search/albums?RechIdSerie=" \`
			`"&RechIdAuteur=undefined&csrf_token_bedetheque={}" \`
			`"&RechSerie=&RechTitre=&RechEditeur=&RechCollection=" \`
			`"&RechStyle=&RechAuteur=&RechISBN={}&RechParution=" \`
			`"&RechOrigine=&RechLangue=&RechMotCle=&RechDLDeb=" \`
			`"&RechDLFin=&RechCoteMin=&RechCoteMax=" \`
			`"&RechEO=0".format(self.csrf_token, isbn)`
			`result = self.session.get(scrap_url, headers={'Referer': self.referer})`
			`content = result.content.decode()`
			`regex = r'href=\"(https://www\.bedetheque\.com/BD.*.html)\"'`
			`return re.findall(regex, content)`

			`def scrap_bd_info(self, bd_url: str) -> dict:`
			`"""`
			`Load BD web page and scrap data`
			`:param bd_url: URL where to find BD data`
			`:return: BD data`
			`"""`
			`result = self.session.get(bd_url)`
			`content = result.content.decode().replace('\r', '')`
			`content = content.replace('\n', '')`
			`regex_title = r'<h1>\s<a[^>]>(.)</a>\s</h1>'`
			`regex_subtitle = r'<h2>\s(.)</h2>'`
			`regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'`
			`regex_nb_of_pages = r'numberOfPages\">(\d*)</span'`
Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`regex_format = r'<label>Format : </label>Format (\w+)</li>'`
Scrap author and illustrator (may not work for some books) 2020-02-10 03:58:52 +01:00			`regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'`
One line was too long 2020-02-10 05:06:24 +01:00			`regex_illustrator = r'span itemprop=\"illustrator\">(((?!<).)*)</span'`
Add bedeteque scraper 2019-08-19 12:17:49 +02:00
			`data = {`
			`'external_url': bd_url,`
			`}`

			`# Get title`
			`search_title = re.search(regex_title, content)`
			`if search_title:`
			`data['title'] = search_title.group(1).strip()`

			`# For subtitle, take serie number into account`
			`search_subtitle = re.search(regex_subtitle, content)`
			`if search_subtitle:`
			`subtitle = search_subtitle.group(1)`
			`subtitle = subtitle.replace('<span class="numa"></span>', '')`
			`data['subtitle'] = ' '.join(subtitle.split())`

			`# Get publish date`
			`search_publish_date = re.search(regex_publish_date, content)`
			`if search_publish_date:`
			`data['publish_date'] = search_publish_date.group(1)`

			`# Get number of pages`
			`search_nb_pages = re.search(regex_nb_of_pages, content)`
			`if search_nb_pages and search_nb_pages.group(1).isnumeric():`
			`data['number_of_pages'] = search_nb_pages.group(1)`
Fix page numbers 2020-02-10 16:32:13 +01:00			`elif 'number_of_pages' not in data:`
			`data['number_of_pages'] = 0`
Add bedeteque scraper 2019-08-19 12:17:49 +02:00
Split comic strips and mangas 2020-05-21 16:56:41 +02:00			`# Get format of the book`
			`search_format = re.search(regex_format, content)`
			`if search_format:`
			`data['format'] = search_format.group(1).lower()`

Scrap author and illustrator (may not work for some books) 2020-02-10 03:58:52 +01:00			`# Get author and illustrator`
			`author = re.search(regex_author, content)`
Pipelines 2020-02-10 04:59:39 +01:00			`if 'author' not in data:`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`data['authors'] = list()`
Scrap author and illustrator (may not work for some books) 2020-02-10 03:58:52 +01:00			`if author:`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`author_obj = Auteur.objects.get_or_create(`
			`name=author.group(1))[0]`
			`data['authors'].append(author_obj)`
Scrap author and illustrator (may not work for some books) 2020-02-10 03:58:52 +01:00			`illustrator = re.search(regex_illustrator, content)`
			`if illustrator:`
Fix authors & openlibrary scrap 2020-02-10 11:29:26 +01:00			`author_obj = Auteur.objects.get_or_create(`
			`name=illustrator.group(1))[0]`
			`data['authors'].append(author_obj)`
Scrap author and illustrator (may not work for some books) 2020-02-10 03:58:52 +01:00
Add bedeteque scraper 2019-08-19 12:17:49 +02:00			`return data`