Optimize CSV processing
This commit is contained in:
parent
368f07da32
commit
86d274ac84
|
@ -1,6 +1,6 @@
|
||||||
import csv
|
import csv
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from io import BytesIO
|
from io import BytesIO, TextIOWrapper
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from zoneinfo import ZoneInfo
|
from zoneinfo import ZoneInfo
|
||||||
|
|
||||||
|
@ -8,8 +8,8 @@ import requests
|
||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from trainvel.gtfs.models import Agency, Calendar, CalendarDate, FeedInfo, GTFSFeed, Route, Stop, StopTime, Transfer, Trip, \
|
from trainvel.gtfs.models import Agency, Calendar, CalendarDate, FeedInfo, GTFSFeed, Route, Stop, StopTime, \
|
||||||
PickupType
|
Transfer, Trip, PickupType
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
|
@ -52,16 +52,18 @@ class Command(BaseCommand):
|
||||||
self.stdout.write(f"Downloading GTFS feed for {gtfs_feed}...")
|
self.stdout.write(f"Downloading GTFS feed for {gtfs_feed}...")
|
||||||
resp = requests.get(gtfs_feed.feed_url, allow_redirects=True, stream=True)
|
resp = requests.get(gtfs_feed.feed_url, allow_redirects=True, stream=True)
|
||||||
with ZipFile(BytesIO(resp.content)) as zipfile:
|
with ZipFile(BytesIO(resp.content)) as zipfile:
|
||||||
def read_file(filename):
|
def read_csv(filename):
|
||||||
lines = zipfile.read(filename).decode().replace('\ufeff', '').splitlines()
|
with zipfile.open(filename, 'r') as zf:
|
||||||
return [line.strip() for line in lines]
|
with TextIOWrapper(zf, encoding='utf-8') as wrapper:
|
||||||
|
reader = csv.DictReader(wrapper)
|
||||||
|
reader.fieldnames = [field.replace('\ufeff', '').strip()
|
||||||
|
for field in reader.fieldnames]
|
||||||
|
for row in tqdm(reader, desc=filename, unit=' rows'):
|
||||||
|
yield {k.strip(): v.strip() for k, v in row.items()}
|
||||||
|
|
||||||
agencies = []
|
agencies = []
|
||||||
for agency_dict in csv.DictReader(read_file("agency.txt")):
|
for agency_dict in read_csv("agency.txt"):
|
||||||
agency_dict: dict
|
agency_dict: dict
|
||||||
# if gtfs_code == "FR-EUROSTAR" \
|
|
||||||
# and agency_dict['agency_id'] != 'ES' and agency_dict['agency_id'] != 'ER':
|
|
||||||
# continue
|
|
||||||
agency = Agency(
|
agency = Agency(
|
||||||
id=f"{gtfs_code}-{agency_dict['agency_id']}",
|
id=f"{gtfs_code}-{agency_dict['agency_id']}",
|
||||||
name=agency_dict['agency_name'],
|
name=agency_dict['agency_name'],
|
||||||
|
@ -82,7 +84,7 @@ class Command(BaseCommand):
|
||||||
agencies.clear()
|
agencies.clear()
|
||||||
|
|
||||||
stops = []
|
stops = []
|
||||||
for stop_dict in csv.DictReader(tqdm(read_file("stops.txt"), desc="Stops")):
|
for stop_dict in read_csv("stops.txt"):
|
||||||
stop_dict: dict
|
stop_dict: dict
|
||||||
stop_id = stop_dict['stop_id']
|
stop_id = stop_dict['stop_id']
|
||||||
stop_id = f"{gtfs_code}-{stop_id}"
|
stop_id = f"{gtfs_code}-{stop_id}"
|
||||||
|
@ -120,7 +122,7 @@ class Command(BaseCommand):
|
||||||
stops.clear()
|
stops.clear()
|
||||||
|
|
||||||
routes = []
|
routes = []
|
||||||
for route_dict in csv.DictReader(tqdm(read_file("routes.txt"), desc="Routes")):
|
for route_dict in read_csv("routes.txt"):
|
||||||
route_dict: dict
|
route_dict: dict
|
||||||
route_id = route_dict['route_id']
|
route_id = route_dict['route_id']
|
||||||
route_id = f"{gtfs_code}-{route_id}"
|
route_id = f"{gtfs_code}-{route_id}"
|
||||||
|
@ -160,7 +162,7 @@ class Command(BaseCommand):
|
||||||
Calendar.objects.filter(gtfs_feed=gtfs_feed).delete()
|
Calendar.objects.filter(gtfs_feed=gtfs_feed).delete()
|
||||||
calendars = {}
|
calendars = {}
|
||||||
if "calendar.txt" in zipfile.namelist():
|
if "calendar.txt" in zipfile.namelist():
|
||||||
for calendar_dict in csv.DictReader(tqdm(read_file("calendar.txt"), desc="Calendars")):
|
for calendar_dict in read_csv("calendar.txt"):
|
||||||
calendar_dict: dict
|
calendar_dict: dict
|
||||||
calendar = Calendar(
|
calendar = Calendar(
|
||||||
id=f"{gtfs_code}-{calendar_dict['service_id']}",
|
id=f"{gtfs_code}-{calendar_dict['service_id']}",
|
||||||
|
@ -194,7 +196,7 @@ class Command(BaseCommand):
|
||||||
calendars.clear()
|
calendars.clear()
|
||||||
|
|
||||||
calendar_dates = []
|
calendar_dates = []
|
||||||
for calendar_date_dict in csv.DictReader(tqdm(read_file("calendar_dates.txt"), desc="Calendar dates")):
|
for calendar_date_dict in read_csv("calendar_dates.txt"):
|
||||||
calendar_date_dict: dict
|
calendar_date_dict: dict
|
||||||
calendar_date = CalendarDate(
|
calendar_date = CalendarDate(
|
||||||
id=f"{gtfs_code}-{calendar_date_dict['service_id']}-{calendar_date_dict['date']}",
|
id=f"{gtfs_code}-{calendar_date_dict['service_id']}-{calendar_date_dict['date']}",
|
||||||
|
@ -241,7 +243,7 @@ class Command(BaseCommand):
|
||||||
calendar_dates.clear()
|
calendar_dates.clear()
|
||||||
|
|
||||||
trips = []
|
trips = []
|
||||||
for trip_dict in csv.DictReader(tqdm(read_file("trips.txt"), desc="Trips")):
|
for trip_dict in read_csv("trips.txt"):
|
||||||
trip_dict: dict
|
trip_dict: dict
|
||||||
trip_id = trip_dict['trip_id']
|
trip_id = trip_dict['trip_id']
|
||||||
route_id = trip_dict['route_id']
|
route_id = trip_dict['route_id']
|
||||||
|
@ -280,7 +282,7 @@ class Command(BaseCommand):
|
||||||
trips.clear()
|
trips.clear()
|
||||||
|
|
||||||
stop_times = []
|
stop_times = []
|
||||||
for stop_time_dict in csv.DictReader(tqdm(read_file("stop_times.txt"), desc="Stop times")):
|
for stop_time_dict in read_csv("stop_times.txt"):
|
||||||
stop_time_dict: dict
|
stop_time_dict: dict
|
||||||
|
|
||||||
stop_id = stop_time_dict['stop_id']
|
stop_id = stop_time_dict['stop_id']
|
||||||
|
@ -339,7 +341,7 @@ class Command(BaseCommand):
|
||||||
|
|
||||||
if "transfers.txt" in zipfile.namelist():
|
if "transfers.txt" in zipfile.namelist():
|
||||||
transfers = []
|
transfers = []
|
||||||
for transfer_dict in csv.DictReader(tqdm(read_file("transfers.txt"), desc="Transfers")):
|
for transfer_dict in read_csv("transfers.txt"):
|
||||||
transfer_dict: dict
|
transfer_dict: dict
|
||||||
from_stop_id = transfer_dict['from_stop_id']
|
from_stop_id = transfer_dict['from_stop_id']
|
||||||
to_stop_id = transfer_dict['to_stop_id']
|
to_stop_id = transfer_dict['to_stop_id']
|
||||||
|
@ -370,7 +372,7 @@ class Command(BaseCommand):
|
||||||
transfers.clear()
|
transfers.clear()
|
||||||
|
|
||||||
if "feed_info.txt" in zipfile.namelist() and not dry_run:
|
if "feed_info.txt" in zipfile.namelist() and not dry_run:
|
||||||
for feed_info_dict in csv.DictReader(tqdm(read_file("feed_info.txt"), desc="Feed info")):
|
for feed_info_dict in read_csv("feed_info.txt"):
|
||||||
feed_info_dict: dict
|
feed_info_dict: dict
|
||||||
FeedInfo.objects.update_or_create(
|
FeedInfo.objects.update_or_create(
|
||||||
publisher_name=feed_info_dict['feed_publisher_name'],
|
publisher_name=feed_info_dict['feed_publisher_name'],
|
||||||
|
|
Loading…
Reference in New Issue