Scrape from Google Books API

2025-10-18 12:26:42 +02:00 · 2020-05-12 17:08:40 +02:00
parent 43b3b5ccfe
commit adbaf66401
1 changed files with 61 additions and 8 deletions
--- a/media/forms.py
+++ b/media/forms.py
@@ -8,6 +8,7 @@ import unicodedata
 import urllib.request
 from django.forms import ModelForm
 from django.utils.translation import gettext_lazy as _
 from .models import Auteur
 from .scraper import BedetequeScraper
@@ -35,6 +36,51 @@ class MediaAdminForm(ModelForm):
        self.cleaned_data.update(data)
        return True
    def download_data_google(self, isbn):
        """
        Download data from Google Books and fill ``cleaned_data`` with it.

        Queries the Google Books volumes API for the given ISBN and, when the
        response reports at least one volume, hands the first volume to
        ``parse_data_google``.

        Errors raised by ``urlopen`` or by the JSON decoding are not caught
        here and propagate to the caller.

        :param isbn: ISBN to look up
        :return True if success, False when the response contains no volume
        """
        from urllib.parse import quote

        # Percent-encode the ISBN since it is embedded in the query string;
        # digits, "X" and hyphens are left untouched by quote().
        api_url = "https://www.googleapis.com/books/v1/volumes?q=isbn:{}" \
            .format(quote(str(isbn), safe=''))
        with urllib.request.urlopen(api_url) as url:
            data = json.loads(url.read().decode())

        # A response lacking 'totalItems' or 'items' is treated as "not found"
        # instead of raising KeyError/IndexError.
        if not data or not data.get('totalItems'):
            return False
        items = data.get('items')
        if not items:
            return False

        # Fill the data from the first matching volume
        self.parse_data_google(items[0])
        return True
    def parse_data_google(self, data):
        """
        Fill ``cleaned_data`` from one Google Books volume.

        Copies the external URL, title, subtitle, page count and publication
        date found in ``data['volumeInfo']`` and appends one ``Auteur`` per
        listed author (created if it does not exist yet).

        :param data: a single volume entry of the API response; it must
            contain a ``volumeInfo`` mapping with ``canonicalVolumeLink``
        """
        info = data['volumeInfo']
        self.cleaned_data['external_url'] = info['canonicalVolumeLink']

        if 'title' in info:
            self.cleaned_data['title'] = info['title']
        # Look the subtitle up in volumeInfo, where it is read from; testing
        # the outer volume dict meant the subtitle was never stored.
        if 'subtitle' in info:
            self.cleaned_data['subtitle'] = info['subtitle']

        if 'pageCount' in info:
            self.cleaned_data['number_of_pages'] = info['pageCount']
        elif not self.cleaned_data.get('number_of_pages'):
            # Neither the API nor the form gave a page count: default to 0.
            # .get() avoids a KeyError when the key is absent.
            self.cleaned_data['number_of_pages'] = 0

        if 'publishedDate' in info:
            # NOTE(review): the value is stored as received; presumably it
            # can be a partial date (year only) - confirm that the
            # publish_date field accepts that.
            self.cleaned_data['publish_date'] = info['publishedDate']

        if 'authors' not in self.cleaned_data:
            self.cleaned_data['authors'] = []
        for author in info.get('authors', ()):
            author_obj = Auteur.objects.get_or_create(name=author)[0]
            self.cleaned_data['authors'].append(author_obj)
    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary
@@ -54,7 +100,6 @@ class MediaAdminForm(ModelForm):
        return False
    def parse_data_openlibrary(self, data):
        print(data)
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
@@ -89,18 +134,19 @@ class MediaAdminForm(ModelForm):
                    Nov="11",
                    Dec="12",
                )
-                if split[0] in month_to_number:
+                if split[0][:3] in month_to_number:
                    self.cleaned_data['publish_date']\
                        = split[2] + "-" \
-                        + month_to_number[split[0]] + "-" + split[1]
+                        + month_to_number[split[0][:3]] + "-" + split[1]
                else:
                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
                        .format(split[2], months.index(split[0])
                                + 1, int(split[1]), )
        if 'authors' not in self.cleaned_data:
            self.cleaned_data['authors'] = list()
        if 'authors' in data:
            if 'author' not in self.cleaned_data:
                self.cleaned_data['authors'] = list()
            for author in data['authors']:
                author_obj = Auteur.objects.get_or_create(
                    name=author['name'])[0]
@@ -122,8 +168,13 @@ class MediaAdminForm(ModelForm):
                # ISBN is present, try with bedeteque
                scrap_result = self.download_data_bedeteque(isbn)
                if not scrap_result:
-                    # Try with OpenLibrary
+                    # Try with Google Books
-                    self.download_data_openlibrary(isbn)
+                    scrap_result = self.download_data_google(isbn)
                    if not scrap_result:
                        # Try with OpenLibrary
                        if not self.download_data_openlibrary(isbn):
                            self.add_error('isbn', _("This ISBN is not found."))
                            return self.cleaned_data
                if self.cleaned_data['title']:
                    self.cleaned_data['title'] = re.sub(
@@ -183,6 +234,8 @@ class MediaAdminForm(ModelForm):
                    ).casefold().upper()
                    self.cleaned_data['side_identifier'] = side_identifier
        print(self.cleaned_data)
        return self.cleaned_data
    def _clean_fields(self):
@@ -198,7 +251,7 @@ class MediaAdminForm(ModelForm):
            from django.core.exceptions import ValidationError
            try:
                # We don't want to check a field when we enter an ISBN.
-                if "_isbn" not in self.request.POST \
+                if "isbn" not in self.request.POST \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value