mirror of
https://gitlab.crans.org/mediatek/med.git
synced 2024-11-26 21:27:11 +00:00
Scrape author and illustrator (may not work for some books)
This commit is contained in:
parent
552d2b8f0e
commit
6cd7f883b9
@ -56,6 +56,8 @@ class BedetequeScraper:
|
|||||||
regex_subtitle = r'<h2>\s*(.*)</h2>'
|
regex_subtitle = r'<h2>\s*(.*)</h2>'
|
||||||
regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
|
regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
|
||||||
regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
|
regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
|
||||||
|
regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
|
||||||
|
regex_illustrator = r'<span itemprop=\"illustrator\">(((?!<).)*)</span>'
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
'external_url': bd_url,
|
'external_url': bd_url,
|
||||||
@ -87,4 +89,16 @@ class BedetequeScraper:
|
|||||||
if search_nb_pages and search_nb_pages.group(1).isnumeric():
|
if search_nb_pages and search_nb_pages.group(1).isnumeric():
|
||||||
data['number_of_pages'] = search_nb_pages.group(1)
|
data['number_of_pages'] = search_nb_pages.group(1)
|
||||||
|
|
||||||
|
# Get author and illustrator
|
||||||
|
author = re.search(regex_author, content)
|
||||||
|
if not 'author' in data:
|
||||||
|
data['author'] = list()
|
||||||
|
if author:
|
||||||
|
data['author'].append(author.group(1))
|
||||||
|
illustrator = re.search(regex_illustrator, content)
|
||||||
|
if illustrator:
|
||||||
|
data['author'].append(illustrator.group(1))
|
||||||
|
|
||||||
|
print(data)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
Loading…
Reference in New Issue
Block a user