Add option to update dataset silently

This commit is contained in:
Emmy D'Anello 2023-02-13 13:12:39 +01:00
parent 4f326626bf
commit 9fbf4fb172
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed file with 36 additions and 31 deletions

67
app.py
View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from contextlib import nullcontext
import csv import csv
from datetime import date, datetime, time, timedelta from datetime import date, datetime, time, timedelta
import os import os
@ -59,54 +60,58 @@ class RouteQueue(db.Model):
@cli.command("update-dataset") @cli.command("update-dataset")
def update_dataset(): @click.option('--verbose', '-v', is_flag=True, help="Display errors.")
def update_dataset(verbose: bool = False):
""" """
Query the latest version of the SNCF OpenData dataset, as a CSV file. Query the latest version of the SNCF OpenData dataset, as a CSV file.
""" """
try: resp = requests.get('https://ressources.data.sncf.com/explore/dataset/tgvmax/information/')
resp = requests.get('https://ressources.data.sncf.com/explore/dataset/tgvmax/information/') content = resp.content.decode().split('<script type="application/ld+json">')[1].split('</script>')[0].strip()
content = resp.content.decode().split('<script type="application/ld+json">')[1].split('</script>')[0].strip() content = content.replace('\r', '')
content = content.replace('\r', '') content = content.replace('" \n', '" \\n')
content = content.replace('" \n', '" \\n') content = content.replace('.\n', '.\\n')
content = content.replace('.\n', '.\\n') content = content.replace('\n\n \nLa', '\\n\\n \\nLa')
content = content.replace('\n\n \nLa', '\\n\\n \\nLa') content = content.replace('\n"', '\\n"')
content = content.replace('\n"', '\\n"')
info = json.loads(content) info = json.loads(content)
modified_date = datetime.fromisoformat(info['dateModified']) modified_date = datetime.fromisoformat(info['dateModified'])
utc = timezone('UTC') utc = timezone('UTC')
last_modified = datetime.utcfromtimestamp(os.path.getmtime('tgvmax.csv')).replace(tzinfo=utc) if os.path.isfile( last_modified = datetime.utcfromtimestamp(os.path.getmtime('tgvmax.csv')).replace(tzinfo=utc) if os.path.isfile(
'tgvmax.csv') else datetime(1, 1, 1, tzinfo=utc) 'tgvmax.csv') else datetime(1, 1, 1, tzinfo=utc)
if last_modified < modified_date: if last_modified < modified_date:
if verbose:
print("Updating tgvmax.csv…") print("Updating tgvmax.csv…")
with requests.get(info['distribution'][0]['contentUrl'], stream=True) as resp: with requests.get(info['distribution'][0]['contentUrl'], stream=True) as resp:
resp.raise_for_status() resp.raise_for_status()
with open('tgvmax.csv', 'wb') as f: with open('tgvmax.csv', 'wb') as f:
with tqdm(unit='io', unit_scale=True) as t: with tqdm(unit='io', unit_scale=True) if verbose else nullcontext() as t:
for chunk in resp.iter_content(chunk_size=512 * 1024): for chunk in resp.iter_content(chunk_size=512 * 1024):
if chunk: if chunk:
f.write(chunk) f.write(chunk)
if verbose:
t.update(len(chunk)) t.update(len(chunk))
os.utime('tgvmax.csv', (modified_date.timestamp(), modified_date.timestamp())) os.utime('tgvmax.csv', (modified_date.timestamp(), modified_date.timestamp()))
if verbose:
print("Done") print("Done")
else:
print("Last modification:", modified_date) if verbose:
except Exception as e: print("Last modification:", modified_date)
print("An error occured while updating tgvmax.csv") exit(2)
print(e)
@cli.command("parse-csv") @cli.command("parse-csv")
@click.option('-F', '--flush', type=bool, is_flag=True, help="Flush the database before filling it.") @click.option('-F', '--flush', type=bool, is_flag=True, help="Flush the database before filling it.")
def parse_trains(flush: bool = False): @click.option('--verbose', '-v', is_flag=True, help="Display errors.")
def parse_trains(flush: bool = False, verbose: bool = False):
""" """
Parse the CSV file and store it to the database. Parse the CSV file and store it to the database.
""" """
if flush: if flush:
print("Flush database…") if verbose:
print("Flush database…")
db.session.query(Train).delete() db.session.query(Train).delete()
last_modification = datetime.utcfromtimestamp(os.path.getmtime('tgvmax.csv')).replace(tzinfo=timezone('UTC')) last_modification = datetime.utcfromtimestamp(os.path.getmtime('tgvmax.csv')).replace(tzinfo=timezone('UTC'))
@ -114,7 +119,7 @@ def parse_trains(flush: bool = False):
with open('tgvmax.csv') as f: with open('tgvmax.csv') as f:
first_line = True first_line = True
already_seen = set() already_seen = set()
for line in tqdm(csv.reader(f, delimiter=';')): for line in (tqdm if verbose else lambda x: x)(csv.reader(f, delimiter=';')):
if first_line: if first_line:
first_line = False first_line = False
continue continue