Pull station data from National Rail station index

This commit is contained in:
akp 2023-10-29 18:29:41 +00:00
parent 88857c0992
commit 7a70dd56b7
No known key found for this signature in database
GPG key ID: CF8D58F3DEB20755
3 changed files with 98 additions and 4 deletions

3
.gitignore vendored
View file

@ -1 +1,2 @@
run/
/run/
NRT_*_Station_Index.pdf

View file

@ -3,6 +3,86 @@
import requests
import json
import sys
import re
from pypdf import PdfReader
NRT_STATION_INDEX_FILENAME = sys.argv[1]
def case_insensitive_rstrip(inp, snip):
snip = snip.lower()
if inp.lower().endswith(snip):
return inp[:-len(snip)].strip()
return inp
def is_extraneous_code(code):
dprs_codes = {
# Source: http://www.railwaycodes.org.uk/crs/crs2.shtm
"EBF": "EBD", # Ebbsfleet International
"GCL": "GLC", # Glasgow Central
"GQL": "GLQ", # Glasgow Queen Street
"HEZ": "HEW", # Heworth
"HII": "HHY", # Highbury & Islington
"XHZ": "HHY",
"LIF": "LTV", # Lichfield Trent Valley
"LVL": "LIV", # Liverpool Lime Street
"ALE": "LPY", # Liverpool South Parkway
"SPL": "STP", # London St Pancras
"SPX": "STP",
"XRO": "RET", # Retford
"GTI": "SGB", # Smethwick Galton Bridge
"TAH": "TAM", # Tamworth
"WJH": "WIJ", # Willesden Junction
"WJL": "WIJ",
"WPH": "WOP", # Worcestershire Parkway
}
return code.upper() in dprs_codes
def get_names_from_pdf(filename):
reader = PdfReader(filename)
def extract_text_from_page(page):
items = []
def visitor_body(text, cm, tm, font_dict, font_size):
x, y = tm[4], tm[5]
if text == "" or (x == 0 and y == 0):
return
items.append((text, x, y))
page.extract_text(visitor_text=visitor_body)
return items
stations = {}
print("Processing station index PDF", file=sys.stderr)
for page_number in range(len(reader.pages)):
items = extract_text_from_page(reader.pages[page_number])
i = 0
while i < len(items):
x = items[i][0].strip()
try:
next_text = items[i+1][0]
except IndexError:
next_text = ""
if re.match("[A-Z]{3}", x) and next_text != "\n" and "(continued)" not in next_text:
stations[x] = items[i+1][0].strip()
i += 1
return stations
url = "https://overpass-api.de/api/interpreter"
@ -25,13 +105,26 @@ for elem in rj.get("elements", []):
tags = elem.get("tags", {})
crs = tags.get("ref:crs")
if crs is None:
if crs is None or is_extraneous_code(crs):
continue
name = tags.get("name", "")
name = case_insensitive_rstrip(name, "high level")
name = case_insensitive_rstrip(name, "low level")
out[crs] = {
"lat": elem.get("lat", 0),
"lon": elem.get("lon", 0),
"name": tags.get("name", ""),
"name": name,
}
nrt_names = get_names_from_pdf(NRT_STATION_INDEX_FILENAME)
print("Overwriting station names with NRT names", file=sys.stderr)
for crs in out:
if crs in nrt_names:
out[crs]["name"] = nrt_names[crs]
json.dump(out, open("stationData.json", "w"))

File diff suppressed because one or more lines are too long