Scrape from Google Books API

2025-12-14 04:45:14 +01:00 · 2020-05-12 17:08:40 +02:00
parent 43b3b5ccfe
commit adbaf66401
1 changed files with 61 additions and 8 deletions
--- a/media/forms.py
+++ b/media/forms.py
@@ -8,6 +8,7 @@ import unicodedata
 import urllib.request

 from django.forms import ModelForm
+from django.utils.translation import gettext_lazy as _

 from .models import Auteur
 from .scraper import BedetequeScraper
@@ -35,6 +36,51 @@ class MediaAdminForm(ModelForm):
        self.cleaned_data.update(data)
        return True

+    def download_data_google(self, isbn):
+        """
+        Download data from the Google Books API and fill the form with it.
+
+        The first volume returned for the ISBN is handed to
+        ``parse_data_google``.
+
+        :param isbn: ISBN to look up
+        :return: True if a volume was found and parsed, False when the API
+            returned no result, could not be reached or sent invalid JSON
+        """
+        from urllib.error import URLError
+        from urllib.parse import quote
+
+        # Quote the ISBN so that an unexpected character in the form value
+        # cannot break the query string.
+        api_url = "https://www.googleapis.com/books/v1/volumes?q=isbn:{}" \
+            .format(quote(str(isbn), safe=''))
+        try:
+            with urllib.request.urlopen(api_url) as url:
+                data = json.loads(url.read().decode())
+        except (URLError, ValueError):
+            # Unreachable API or malformed answer: report a failed lookup
+            # so that the caller can try its next data source.
+            return False
+
+        if not data or not data.get('totalItems'):
+            return False
+        items = data.get('items')
+        if not items:
+            return False
+
+        # Fill the data with the first match
+        self.parse_data_google(items[0])
+        return True
+
+    def parse_data_google(self, data):
+        """
+        Copy the fields of a Google Books volume into ``cleaned_data``.
+
+        Fills ``external_url``, ``title``, ``subtitle``, ``number_of_pages``,
+        ``publish_date`` and ``authors`` from the ``volumeInfo`` entry when
+        the corresponding key is present.
+
+        :param data: one entry of the ``items`` list of the API response
+        """
+        info = data.get('volumeInfo', {})
+        if 'canonicalVolumeLink' in info:
+            self.cleaned_data['external_url'] = info['canonicalVolumeLink']
+        if 'title' in info:
+            self.cleaned_data['title'] = info['title']
+        # The subtitle is read from volumeInfo, so it must be looked up there
+        if 'subtitle' in info:
+            self.cleaned_data['subtitle'] = info['subtitle']
+
+        if 'pageCount' in info:
+            self.cleaned_data['number_of_pages'] = info['pageCount']
+        elif not self.cleaned_data.get('number_of_pages'):
+            # No page count from the API and none already in the form
+            self.cleaned_data['number_of_pages'] = 0
+
+        # NOTE(review): publishedDate is copied verbatim; Google Books may
+        # return a partial date (e.g. year only) - confirm the field copes.
+        if 'publishedDate' in info:
+            self.cleaned_data['publish_date'] = info['publishedDate']
+
+        # Work on a plain list so that appending is safe whatever the
+        # existing value is (missing, None or another iterable).
+        authors = list(self.cleaned_data.get('authors') or [])
+        for author in info.get('authors', []):
+            author_obj = Auteur.objects.get_or_create(name=author)[0]
+            authors.append(author_obj)
+        self.cleaned_data['authors'] = authors
+
    def download_data_openlibrary(self, isbn):
        """
        Download data from openlibrary
@@ -54,7 +100,6 @@ class MediaAdminForm(ModelForm):
        return False

    def parse_data_openlibrary(self, data):
-        print(data)
        self.cleaned_data['external_url'] = data['url']
        if 'title' in data:
            self.cleaned_data['title'] = data['title']
@@ -89,18 +134,19 @@ class MediaAdminForm(ModelForm):
                    Nov="11",
                    Dec="12",
                )
-                if split[0] in month_to_number:
+                if split[0][:3] in month_to_number:
                    self.cleaned_data['publish_date']\
                        = split[2] + "-" \
-                        + month_to_number[split[0]] + "-" + split[1]
+                        + month_to_number[split[0][:3]] + "-" + split[1]
                else:
                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}" \
                        .format(split[2], months.index(split[0])
                                + 1, int(split[1]), )

+        if 'authors' not in self.cleaned_data:
+            self.cleaned_data['authors'] = list()
+
        if 'authors' in data:
-            if 'author' not in self.cleaned_data:
-                self.cleaned_data['authors'] = list()
            for author in data['authors']:
                author_obj = Auteur.objects.get_or_create(
                    name=author['name'])[0]
@@ -122,8 +168,13 @@ class MediaAdminForm(ModelForm):
                # ISBN is present, try with bedeteque
                scrap_result = self.download_data_bedeteque(isbn)
                if not scrap_result:
-                    # Try with OpenLibrary
-                    self.download_data_openlibrary(isbn)
+                    # Try with Google Books
+                    scrap_result = self.download_data_google(isbn)
+                    if not scrap_result:
+                        # Try with OpenLibrary
+                        if not self.download_data_openlibrary(isbn):
+                            self.add_error('isbn', _("This ISBN is not found."))
+                            return self.cleaned_data

                if self.cleaned_data['title']:
                    self.cleaned_data['title'] = re.sub(
@@ -183,6 +234,8 @@ class MediaAdminForm(ModelForm):
                    ).casefold().upper()
                    self.cleaned_data['side_identifier'] = side_identifier

+        print(self.cleaned_data)
+
        return self.cleaned_data

    def _clean_fields(self):
@@ -198,7 +251,7 @@ class MediaAdminForm(ModelForm):
            from django.core.exceptions import ValidationError
            try:
                # We don't want to check a field when we enter an ISBN.
-                if "_isbn" not in self.request.POST \
+                if "isbn" not in self.request.POST \
                        or not self.cleaned_data.get('isbn'):
                    value = field.clean(value)
                self.cleaned_data[name] = value