mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2024-11-26 21:27:11 +00:00
Scrape author and illustrator (may not work for some books)
This commit is contained in:
parent
552d2b8f0e
commit
6cd7f883b9
@ -56,6 +56,8 @@ class BedetequeScraper:
|
||||
regex_subtitle = r'<h2>\s*(.*)</h2>'
|
||||
regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
|
||||
regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
|
||||
regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
|
||||
regex_illustrator = r'<span itemprop=\"illustrator\">(((?!<).)*)</span>'
|
||||
|
||||
data = {
|
||||
'external_url': bd_url,
|
||||
@ -87,4 +89,16 @@ class BedetequeScraper:
|
||||
if search_nb_pages and search_nb_pages.group(1).isnumeric():
|
||||
data['number_of_pages'] = search_nb_pages.group(1)
|
||||
|
||||
# Get author and illustrator
|
||||
author = re.search(regex_author, content)
|
||||
if not 'author' in data:
|
||||
data['author'] = list()
|
||||
if author:
|
||||
data['author'].append(author.group(1))
|
||||
illustrator = re.search(regex_illustrator, content)
|
||||
if illustrator:
|
||||
data['author'].append(illustrator.group(1))
|
||||
|
||||
print(data)
|
||||
|
||||
return data
|
||||
|
Loading…
Reference in New Issue
Block a user