Optimize CSV processing

Emmy D'Anello 2024-05-11 23:18:57 +02:00
parent 368f07da32
commit 86d274ac84
Signed by: ynerant
GPG Key ID: 3A75C55819C8CF85
1 changed file with 20 additions and 18 deletions


@@ -1,6 +1,6 @@
 import csv
 from datetime import datetime, timedelta
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
 from zipfile import ZipFile
 from zoneinfo import ZoneInfo
 
@@ -8,8 +8,8 @@ import requests
 from django.core.management import BaseCommand
 from tqdm import tqdm
 
-from trainvel.gtfs.models import Agency, Calendar, CalendarDate, FeedInfo, GTFSFeed, Route, Stop, StopTime, Transfer, Trip, \
-    PickupType
+from trainvel.gtfs.models import Agency, Calendar, CalendarDate, FeedInfo, GTFSFeed, Route, Stop, StopTime, \
+    Transfer, Trip, PickupType
 
 
 class Command(BaseCommand):
@@ -52,16 +52,18 @@ class Command(BaseCommand):
             self.stdout.write(f"Downloading GTFS feed for {gtfs_feed}...")
             resp = requests.get(gtfs_feed.feed_url, allow_redirects=True, stream=True)
             with ZipFile(BytesIO(resp.content)) as zipfile:
-                def read_file(filename):
-                    lines = zipfile.read(filename).decode().replace('\ufeff', '').splitlines()
-                    return [line.strip() for line in lines]
+                def read_csv(filename):
+                    with zipfile.open(filename, 'r') as zf:
+                        with TextIOWrapper(zf, encoding='utf-8') as wrapper:
+                            reader = csv.DictReader(wrapper)
+                            reader.fieldnames = [field.replace('\ufeff', '').strip()
+                                                 for field in reader.fieldnames]
+                            for row in tqdm(reader, desc=filename, unit=' rows'):
+                                yield {k.strip(): v.strip() for k, v in row.items()}
 
                 agencies = []
-                for agency_dict in csv.DictReader(read_file("agency.txt")):
+                for agency_dict in read_csv("agency.txt"):
                     agency_dict: dict
-                    # if gtfs_code == "FR-EUROSTAR" \
-                    #         and agency_dict['agency_id'] != 'ES' and agency_dict['agency_id'] != 'ER':
-                    #     continue
                     agency = Agency(
                         id=f"{gtfs_code}-{agency_dict['agency_id']}",
                         name=agency_dict['agency_name'],
@@ -82,7 +84,7 @@ class Command(BaseCommand):
                     agencies.clear()
 
                 stops = []
-                for stop_dict in csv.DictReader(tqdm(read_file("stops.txt"), desc="Stops")):
+                for stop_dict in read_csv("stops.txt"):
                     stop_dict: dict
                     stop_id = stop_dict['stop_id']
                     stop_id = f"{gtfs_code}-{stop_id}"
@@ -120,7 +122,7 @@ class Command(BaseCommand):
                     stops.clear()
 
                 routes = []
-                for route_dict in csv.DictReader(tqdm(read_file("routes.txt"), desc="Routes")):
+                for route_dict in read_csv("routes.txt"):
                     route_dict: dict
                     route_id = route_dict['route_id']
                     route_id = f"{gtfs_code}-{route_id}"
@@ -160,7 +162,7 @@ class Command(BaseCommand):
                 Calendar.objects.filter(gtfs_feed=gtfs_feed).delete()
                 calendars = {}
                 if "calendar.txt" in zipfile.namelist():
-                    for calendar_dict in csv.DictReader(tqdm(read_file("calendar.txt"), desc="Calendars")):
+                    for calendar_dict in read_csv("calendar.txt"):
                         calendar_dict: dict
                         calendar = Calendar(
                             id=f"{gtfs_code}-{calendar_dict['service_id']}",
@@ -194,7 +196,7 @@ class Command(BaseCommand):
                         calendars.clear()
 
                 calendar_dates = []
-                for calendar_date_dict in csv.DictReader(tqdm(read_file("calendar_dates.txt"), desc="Calendar dates")):
+                for calendar_date_dict in read_csv("calendar_dates.txt"):
                     calendar_date_dict: dict
                     calendar_date = CalendarDate(
                         id=f"{gtfs_code}-{calendar_date_dict['service_id']}-{calendar_date_dict['date']}",
@@ -241,7 +243,7 @@ class Command(BaseCommand):
                     calendar_dates.clear()
 
                 trips = []
-                for trip_dict in csv.DictReader(tqdm(read_file("trips.txt"), desc="Trips")):
+                for trip_dict in read_csv("trips.txt"):
                     trip_dict: dict
                     trip_id = trip_dict['trip_id']
                     route_id = trip_dict['route_id']
@@ -280,7 +282,7 @@ class Command(BaseCommand):
                     trips.clear()
 
                 stop_times = []
-                for stop_time_dict in csv.DictReader(tqdm(read_file("stop_times.txt"), desc="Stop times")):
+                for stop_time_dict in read_csv("stop_times.txt"):
                     stop_time_dict: dict
 
                     stop_id = stop_time_dict['stop_id']
@@ -339,7 +341,7 @@ class Command(BaseCommand):
 
                 if "transfers.txt" in zipfile.namelist():
                     transfers = []
-                    for transfer_dict in csv.DictReader(tqdm(read_file("transfers.txt"), desc="Transfers")):
+                    for transfer_dict in read_csv("transfers.txt"):
                         transfer_dict: dict
                         from_stop_id = transfer_dict['from_stop_id']
                         to_stop_id = transfer_dict['to_stop_id']
@@ -370,7 +372,7 @@ class Command(BaseCommand):
                         transfers.clear()
 
                 if "feed_info.txt" in zipfile.namelist() and not dry_run:
-                    for feed_info_dict in csv.DictReader(tqdm(read_file("feed_info.txt"), desc="Feed info")):
+                    for feed_info_dict in read_csv("feed_info.txt"):
                         feed_info_dict: dict
                         FeedInfo.objects.update_or_create(
                             publisher_name=feed_info_dict['feed_publisher_name'],
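
For reference, the sketch below reproduces the streaming pattern introduced by the new read_csv helper as a standalone script: instead of decoding the whole zip member into memory and splitting lines, the member is opened as a binary stream, wrapped in a TextIOWrapper, and fed to csv.DictReader so rows are decoded lazily. Unlike the helper in the commit, the ZipFile is passed as an argument rather than captured from the enclosing management command, the tqdm progress bar is omitted, and the archive and member names ("gtfs.zip", "stops.txt") are placeholders for illustration only.

import csv
from io import TextIOWrapper
from zipfile import ZipFile


def read_csv(zipfile, filename):
    # Open the zip member as a binary stream and wrap it in a text layer,
    # so rows are decoded one at a time instead of loading the whole file.
    with zipfile.open(filename, 'r') as zf:
        with TextIOWrapper(zf, encoding='utf-8') as wrapper:
            reader = csv.DictReader(wrapper)
            # Drop the UTF-8 BOM and stray whitespace from the header names.
            reader.fieldnames = [field.replace('\ufeff', '').strip()
                                 for field in reader.fieldnames]
            for row in reader:
                yield {k.strip(): v.strip() for k, v in row.items()}


if __name__ == "__main__":
    # "gtfs.zip" and "stops.txt" are hypothetical names for this example.
    with ZipFile("gtfs.zip") as archive:
        for row in read_csv(archive, "stops.txt"):
            print(row["stop_id"])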