1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2024-12-25 17:42:25 +00:00

Split comic strips and mangas

This commit is contained in:
Yohann D'ANELLO 2020-05-21 16:56:41 +02:00
parent 1657f5c42c
commit 82efeba272
10 changed files with 202 additions and 12 deletions

View File

@ -13,6 +13,8 @@ sleep 2
python manage.py migrate
python manage.py collectstatic --no-input
python manage.py runserver 0.0.0.0:8000
# harakiri parameter respawns processes taking more than 20 seconds
# max-requests parameter respawns processes after serving 5000 requests
# vacuum parameter cleans up when stopped

View File

@ -9,7 +9,7 @@ from reversion.admin import VersionAdmin
from med.admin import admin_site
from .forms import MediaAdminForm
from .models import Auteur, Emprunt, FutureMedia, Jeu, Media
from .models import Auteur, Emprunt, FutureMedia, Jeu, Media, Manga
class AuteurAdmin(VersionAdmin):
@ -121,6 +121,7 @@ class JeuAdmin(VersionAdmin):
admin_site.register(Auteur, AuteurAdmin)
admin_site.register(Media, MediaAdmin)
admin_site.register(Manga, MediaAdmin)
admin_site.register(FutureMedia, FutureMediaAdmin)
admin_site.register(Emprunt, EmpruntAdmin)
admin_site.register(Jeu, JeuAdmin)

View File

@ -10,7 +10,7 @@ import urllib.request
from django.forms import ModelForm
from django.utils.translation import gettext_lazy as _
from .models import Auteur
from .models import Auteur, Media
from .scraper import BedetequeScraper
@ -54,7 +54,6 @@ class MediaAdminForm(ModelForm):
return False
def parse_data_google(self, data):
print(data)
info = data['volumeInfo']
self.cleaned_data['external_url'] = info['canonicalVolumeLink']
if 'title' in info:
@ -71,7 +70,7 @@ class MediaAdminForm(ModelForm):
if 'publishedDate' in info:
self.cleaned_data['publish_date'] = info['publishedDate']
if 'authors' not in self.cleaned_data:
if 'authors' not in self.cleaned_data or not self.cleaned_data['authors']:
self.cleaned_data['authors'] = list()
if 'authors' in info:
@ -144,7 +143,7 @@ class MediaAdminForm(ModelForm):
.format(split[2], months.index(split[0])
+ 1, int(split[1]), )
if 'authors' not in self.cleaned_data:
if 'authors' not in self.cleaned_data or not self.cleaned_data['authors']:
self.cleaned_data['authors'] = list()
if 'authors' in data:
@ -159,17 +158,17 @@ class MediaAdminForm(ModelForm):
"""
super().clean()
if "_isbn" in self.request.POST\
or "_isbn_addanother" in self.request.POST:
if "_isbn" in self.data\
or "_isbn_addanother" in self.data:
isbn = self.cleaned_data.get('isbn')
if "_isbn_addanother" in self.request.POST:
self.request.POST = self.request.POST.copy()
self.request.POST['_addanother'] = 42
if "_isbn_addanother" in self.data:
self.data = self.data.copy()
self.data['_addanother'] = 42
if isbn:
# ISBN is present, try with bedeteque
scrap_result = self.download_data_bedeteque(isbn)
if not scrap_result:
# Try with Fnac
# Try with Google
scrap_result = self.download_data_google(isbn)
if not scrap_result:
# Try with OpenLibrary
@ -251,7 +250,7 @@ class MediaAdminForm(ModelForm):
from django.core.exceptions import ValidationError
try:
# We don't want to check a field when we enter an ISBN.
if "isbn" not in self.request.POST \
if "isbn" not in self.data \
or not self.cleaned_data.get('isbn'):
value = field.clean(value)
self.cleaned_data[name] = value
@ -260,3 +259,7 @@ class MediaAdminForm(ModelForm):
self.cleaned_data[name] = value
except ValidationError as e:
self.add_error(name, e)
class Meta:
    # ModelForm options: expose every field of the Media model.
    fields = '__all__'
    model = Media

View File

View File

View File

@ -0,0 +1,50 @@
from django.core.management import BaseCommand
from media.forms import MediaAdminForm
from media.models import Media, Manga
class Command(BaseCommand):
    """Move every ``Media`` whose looked-up format is "manga" into ``Manga``.

    Each medium is run through ``MediaAdminForm`` with the ``_isbn`` flag set,
    and the ``format`` entry of the resulting ``cleaned_data`` decides whether
    the medium is converted.
    """

    help = "Convert media whose detected format is 'manga' into Manga objects."

    def add_arguments(self, parser):
        """Register the ``--view-only`` flag (report formats, change nothing)."""
        parser.add_argument('--view-only', action="store_true",
                            help="Display only modifications. "
                                 + "Only useful for debug.")

    def handle(self, *args, **options):
        """Walk all media, print their format and convert the mangas.

        With ``--view-only`` the formats are printed but no row is created
        or deleted. The final line reports how many media were converted.
        """
        # Local import: keeps the file-level import block untouched.
        from django.db import transaction

        converted = 0

        for media in Media.objects.all():
            self.stdout.write(str(media))
            form = MediaAdminForm(
                instance=media,
                data={"isbn": media.isbn, "_isbn": True},
            )
            form.full_clean()

            if "format" not in form.cleaned_data:
                self.stdout.write("Format not specified. Assume it is a comic strip.")
                continue

            # Named media_format so the builtin format() is not shadowed.
            media_format = form.cleaned_data["format"]
            self.stdout.write("Format: {}".format(media_format))

            if options["view_only"] or media_format != "manga":
                continue

            self.stdout.write(self.style.WARNING(
                "This media is a manga. Transfer it into a new object..."))

            # Create-then-delete must be all-or-nothing: if the delete fails,
            # the freshly created manga is rolled back instead of leaving the
            # same item stored in both tables.
            with transaction.atomic():
                manga = Manga.objects.create(
                    isbn=media.isbn,
                    title=media.title,
                    subtitle=media.subtitle,
                    external_url=media.external_url,
                    side_identifier=media.side_identifier,
                    number_of_pages=media.number_of_pages,
                    publish_date=media.publish_date,
                )
                # create() already saved the row and set() writes the M2M
                # links directly, so no extra save() is needed.
                manga.authors.set(media.authors.all())
                self.stdout.write(self.style.SUCCESS(
                    "Manga successfully saved. Deleting old medium..."))
                media.delete()

            self.stdout.write(self.style.SUCCESS("Medium deleted"))
            converted += 1

        self.stdout.write(self.style.SUCCESS(
            "Successfully saved {:d} mangas".format(converted)))

View File

@ -0,0 +1,26 @@
# Generated by Django 2.2.10 on 2020-05-12 15:23
from django.db import migrations, models
import media.fields
import media.validators
class Migration(migrations.Migration):
    """Create the ``FutureMedia`` model: an auto primary key plus an ISBN."""

    dependencies = [('media', '0026_auto_20200210_1740')]

    operations = [
        migrations.CreateModel(
            name='FutureMedia',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'isbn',
                    # Optional but unique ISBN, checked by the project's validator.
                    media.fields.ISBNField(
                        blank=True,
                        help_text='You may be able to scan it from a bar code.',
                        max_length=28,
                        null=True,
                        unique=True,
                        validators=[media.validators.isbn_validator],
                        verbose_name='ISBN',
                    ),
                ),
            ],
            options={
                'verbose_name': 'future medium',
                'verbose_name_plural': 'future media',
            },
        ),
    ]

View File

@ -0,0 +1,34 @@
# Generated by Django 2.2.10 on 2020-05-21 14:28
from django.db import migrations, models
import media.fields
import media.validators
class Migration(migrations.Migration):
    """Create the ``Manga`` model, with the same columns as ``Media``.

    The verbose names recorded here match ``Manga.Meta`` ("manga" /
    "mangas"). They only affect the migration state, not the database
    schema, so the table created is unchanged.
    """

    dependencies = [
        ('media', '0027_futuremedia'),
    ]

    operations = [
        migrations.CreateModel(
            name='Manga',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('isbn', media.fields.ISBNField(blank=True, help_text='You may be able to scan it from a bar code.', max_length=28, null=True, unique=True, validators=[media.validators.isbn_validator], verbose_name='ISBN')),
                ('title', models.CharField(max_length=255, verbose_name='title')),
                ('subtitle', models.CharField(blank=True, max_length=255, null=True, verbose_name='subtitle')),
                ('external_url', models.URLField(blank=True, null=True, verbose_name='external URL')),
                ('side_identifier', models.CharField(max_length=255, verbose_name='side identifier')),
                ('number_of_pages', models.PositiveIntegerField(blank=True, null=True, verbose_name='number of pages')),
                ('publish_date', models.DateField(blank=True, null=True, verbose_name='publish date')),
                ('authors', models.ManyToManyField(to='media.Auteur', verbose_name='authors')),
            ],
            options={
                # Was 'medium' / 'media', copied from Media; the model's Meta
                # declares manga / mangas, so the state must say the same or
                # makemigrations reports a pending options change.
                'verbose_name': 'manga',
                'verbose_name_plural': 'mangas',
                'ordering': ['title', 'subtitle'],
            },
        ),
    ]

View File

@ -38,34 +38,41 @@ class Media(models.Model):
blank=True,
null=True,
)
title = models.CharField(
verbose_name=_('title'),
max_length=255,
)
subtitle = models.CharField(
verbose_name=_('subtitle'),
max_length=255,
blank=True,
null=True,
)
external_url = models.URLField(
verbose_name=_('external URL'),
blank=True,
null=True,
)
side_identifier = models.CharField(
verbose_name=_('side identifier'),
max_length=255,
)
authors = models.ManyToManyField(
'Auteur',
verbose_name=_('authors'),
)
number_of_pages = models.PositiveIntegerField(
verbose_name=_('number of pages'),
blank=True,
null=True,
)
publish_date = models.DateField(
verbose_name=_('publish date'),
blank=True,
@ -84,6 +91,67 @@ class Media(models.Model):
ordering = ['title', 'subtitle']
class Manga(models.Model):
    """A manga volume, described by the same fields as ``Media``."""

    # Optional, but unique when present.
    isbn = ISBNField(
        _('ISBN'),
        null=True,
        blank=True,
        unique=True,
        help_text=_('You may be able to scan it from a bar code.'),
    )

    title = models.CharField(max_length=255, verbose_name=_('title'))

    subtitle = models.CharField(
        max_length=255,
        null=True,
        blank=True,
        verbose_name=_('subtitle'),
    )

    external_url = models.URLField(
        null=True,
        blank=True,
        verbose_name=_('external URL'),
    )

    side_identifier = models.CharField(
        max_length=255,
        verbose_name=_('side identifier'),
    )

    authors = models.ManyToManyField('Auteur', verbose_name=_('authors'))

    number_of_pages = models.PositiveIntegerField(
        null=True,
        blank=True,
        verbose_name=_('number of pages'),
    )

    publish_date = models.DateField(
        null=True,
        blank=True,
        verbose_name=_('publish date'),
    )

    def __str__(self):
        """Return "title : subtitle", or just the title when there is no subtitle."""
        if not self.subtitle:
            return self.title
        return "{} : {}".format(self.title, self.subtitle)

    class Meta:
        ordering = ['title', 'subtitle']
        verbose_name = _("manga")
        verbose_name_plural = _("mangas")
class FutureMedia(models.Model):
isbn = ISBNField(
_('ISBN'),

View File

@ -58,6 +58,7 @@ class BedetequeScraper:
regex_subtitle = r'<h2>\s*(.*)</h2>'
regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
regex_format = r'<label>Format : </label>Format (\w+)</li>'
regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
regex_illustrator = r'span itemprop=\"illustrator\">(((?!<).)*)</span'
@ -89,6 +90,11 @@ class BedetequeScraper:
elif 'number_of_pages' not in data:
data['number_of_pages'] = 0
# Get format of the book
search_format = re.search(regex_format, content)
if search_format:
data['format'] = search_format.group(1).lower()
# Get author and illustrator
author = re.search(regex_author, content)
if 'author' not in data: