1
0
mirror of https://gitlab.crans.org/mediatek/med.git synced 2024-11-26 21:27:11 +00:00

Scrape author and illustrator (may not work for some books)

This commit is contained in:
Yohann D'ANELLO 2020-02-10 03:58:52 +01:00
parent 552d2b8f0e
commit 6cd7f883b9

View File

@ -56,6 +56,8 @@ class BedetequeScraper:
regex_subtitle = r'<h2>\s*(.*)</h2>' regex_subtitle = r'<h2>\s*(.*)</h2>'
regex_publish_date = r'datePublished\" content=\"([\d-]*)\">' regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
regex_nb_of_pages = r'numberOfPages\">(\d*)</span' regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
regex_illustrator = r'<span itemprop=\"illustrator\">(((?!<).)*)</span>'
data = { data = {
'external_url': bd_url, 'external_url': bd_url,
@ -87,4 +89,16 @@ class BedetequeScraper:
if search_nb_pages and search_nb_pages.group(1).isnumeric(): if search_nb_pages and search_nb_pages.group(1).isnumeric():
data['number_of_pages'] = search_nb_pages.group(1) data['number_of_pages'] = search_nb_pages.group(1)
# Get author and illustrator
author = re.search(regex_author, content)
if not 'author' in data:
data['author'] = list()
if author:
data['author'].append(author.group(1))
illustrator = re.search(regex_illustrator, content)
if illustrator:
data['author'].append(illustrator.group(1))
print(data)
return data return data