Parse CSV to database

This commit is contained in:
Emmy D'Anello 2023-02-12 17:22:40 +01:00
parent e22a039aa8
commit 39fdfce38f
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed files with 63 additions and 27 deletions

90
app.py
View File

@ -7,10 +7,12 @@ import json
from pytz import timezone from pytz import timezone
import requests import requests
import click
from flask import Flask from flask import Flask
from flask.cli import AppGroup
from flask_migrate import Migrate from flask_migrate import Migrate
from flask_sqlalchemy import SQLAlchemy from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import Boolean, Column, Date, Integer, String, Time from sqlalchemy import Boolean, Column, Date, DateTime, Integer, String, Time
from tqdm import tqdm from tqdm import tqdm
import config import config
@ -18,6 +20,10 @@ import config
app = Flask(__name__) app = Flask(__name__)
cli = AppGroup('tgvmax', help="Manage the TGVMax dataset.")
app.cli.add_command(cli)
app.config |= config.FLASK_CONFIG app.config |= config.FLASK_CONFIG
db = SQLAlchemy(app) db = SQLAlchemy(app)
@ -29,19 +35,25 @@ class Train(db.Model):
id = Column(String, primary_key=True) id = Column(String, primary_key=True)
day = Column(Date, index=True) day = Column(Date, index=True)
number = Column(Integer, index=True) number = Column(Integer, index=True)
entity = Column(String(255)) entity = Column(String(10))
axe = Column(String(255), index=True) axe = Column(String(32), index=True)
orig_iata = Column(String(5), index=True) orig_iata = Column(String(5), index=True)
dest_iata = Column(String(5), index=True) dest_iata = Column(String(5), index=True)
orig = Column(String(255)) orig = Column(String(32))
dest = Column(String(255)) dest = Column(String(32))
dep = Column(String(255)) dep = Column(Time)
arr = Column(Time) arr = Column(Time)
tgvmax = Column(Boolean, index=True) tgvmax = Column(Boolean, index=True)
remaining_seats = Column(Integer) remaining_seats = Column(Integer, default=-1)
last_modification = Column(DateTime)
expiration_time = Column(DateTime)
@cli.command("update-dataset")
def update_dataset(): def update_dataset():
"""
Query the latest version of the SNCF OpenData dataset, as a CSV file.
"""
try: try:
resp = requests.get('https://ressources.data.sncf.com/explore/dataset/tgvmax/information/') resp = requests.get('https://ressources.data.sncf.com/explore/dataset/tgvmax/information/')
content = resp.content.decode().split('<script type="application/ld+json">')[1].split('</script>')[0].strip() content = resp.content.decode().split('<script type="application/ld+json">')[1].split('</script>')[0].strip()
@ -76,37 +88,61 @@ def update_dataset():
print(e) print(e)
def parse_trains(*, filter_day: date | None = None, @cli.command("parse-csv")
filter_number: int | None = None, @click.option('-F', '--flush', type=bool, is_flag=True, help="Flush the database before filling it.")
filter_tgvmax: bool | None = None): def parse_trains(flush: bool = False):
trains = [] """
Parse the CSV file and store it to the database.
"""
if flush:
print("Flush database…")
db.session.query(Train).delete()
last_modification = datetime.utcfromtimestamp(os.path.getmtime('tgvmax.csv')).replace(tzinfo=timezone('UTC'))
with open('tgvmax.csv') as f: with open('tgvmax.csv') as f:
first_line = True first_line = True
for line in csv.reader(f, delimiter=';'): already_seen = set()
for line in tqdm(csv.reader(f, delimiter=';')):
if first_line: if first_line:
first_line = False first_line = False
continue continue
train = Train(*line) train_id = f"{line[1]}-{line[0]}-{line[4]}-{line[5]}"
train.day = date.fromisoformat(train.day) if train_id in already_seen:
train.number = int(train.number) # Some trains are mysteriously duplicated, concerns only some « Intercités de nuit »
train.dep = time.fromisoformat(train.dep) # and the Brive-la-Gaillarde -- Paris
train.arr = time.fromisoformat(train.arr) # and, maybe, for Roubaix-Tourcoing
train.tgvmax = train.tgvmax == 'OUI' if line[3] != "IC NUIT" and line[1] != '3614' and not (line[4] == 'FRADP' and line[5] == 'FRADM'):
print("Duplicate:", train_id)
if filter_day is not None and train.day != filter_day:
continue continue
if filter_number is not None and train.number != filter_number: train = Train(
continue id=train_id,
day=date.fromisoformat(line[0]),
number=int(line[1]),
entity=line[2],
axe=line[3],
orig_iata=line[4],
dest_iata=line[5],
orig=line[6],
dest=line[7],
dep=time.fromisoformat(line[8]),
arr=time.fromisoformat(line[9]),
tgvmax=line[10] == 'OUI',
last_modification=last_modification,
expiration_time=last_modification,
)
if flush:
db.session.add(train)
else:
db.session.merge(train)
if filter_tgvmax is not None and train.tgvmax != filter_tgvmax: if line[3] == "IC NUIT" or line[1] == '3614' or (line[4] == 'FRADP' and line[5] == 'FRADM'):
continue already_seen.add(train_id)
trains.append(train) db.session.commit()
return trains
def find_routes(day, orig, dest): def find_routes(day, orig, dest):