Pull station data from National Rail station index

2023-10-29 18:29:41 +00:00 · 2023-10-29 18:29:41 +00:00 · 7a70dd56b7
commit 7a70dd56b7
parent 88857c0992
3 changed files with 98 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-run/
+/run/
+NRT_*_Station_Index.pdf
--- a/railmiles/internal/core/getStationData.py
+++ b/railmiles/internal/core/getStationData.py
@@ -3,6 +3,86 @@
 import requests
 import json
 import sys
+import re
+from pypdf import PdfReader
+
+
+# Path of the station index PDF, taken from the first command-line argument.
+NRT_STATION_INDEX_FILENAME = sys.argv[1]
+
+
+def case_insensitive_rstrip(inp, snip):
+    """Remove a trailing ``snip`` from ``inp``, ignoring letter case.
+
+    When ``inp`` ends with ``snip`` (compared case-insensitively) the suffix
+    is cut off and surrounding whitespace is stripped from what remains.
+    Otherwise, including when ``snip`` is empty, ``inp`` is returned
+    unchanged.
+    """
+    snip = snip.lower()
+    # An empty suffix would turn the slice below into ``inp[:-0]``, i.e. ``""``,
+    # wiping the whole input, so treat it as "nothing to remove".
+    if snip and inp.lower().endswith(snip):
+        return inp[:-len(snip)].strip()
+    return inp
+
+
+def is_extraneous_code(code):
+    """Report whether ``code`` is a key of the table below, ignoring case.
+
+    Only the keys are consulted. The value paired with each key is
+    presumably the station's main code -- verify against the cited source.
+    """
+    # Source: http://www.railwaycodes.org.uk/crs/crs2.shtm
+    alternates = {
+        "EBF": "EBD",  # Ebbsfleet International
+        "GCL": "GLC",  # Glasgow Central
+        "GQL": "GLQ",  # Glasgow Queen Street
+        "HEZ": "HEW",  # Heworth
+        "HII": "HHY",  # Highbury & Islington
+        "XHZ": "HHY",
+        "LIF": "LTV",  # Lichfield Trent Valley
+        "LVL": "LIV",  # Liverpool Lime Street
+        "ALE": "LPY",  # Liverpool South Parkway
+        "SPL": "STP",  # London St Pancras
+        "SPX": "STP",
+        "XRO": "RET",  # Retford
+        "GTI": "SGB",  # Smethwick Galton Bridge
+        "TAH": "TAM",  # Tamworth
+        "WJH": "WIJ",  # Willesden Junction
+        "WJL": "WIJ",
+        "WPH": "WOP",  # Worcestershire Parkway
+    }
+
+    normalised = code.upper()
+    return normalised in alternates
+
+
+def get_names_from_pdf(filename):
+    """Extract a code -> name mapping from the station index PDF.
+
+    Each page is read as a sequence of positioned text fragments. Whenever
+    a fragment is exactly three upper-case letters and the fragment that
+    follows it is neither a bare newline nor a "(continued)" marker, the
+    following fragment (stripped) is recorded as the name for that code.
+
+    Args:
+        filename: Path of the PDF, passed straight to ``PdfReader``.
+
+    Returns:
+        A dict mapping three-letter codes to names. If a code occurs more
+        than once, the last occurrence wins.
+    """
+    reader = PdfReader(filename)
+
+    def extract_text_from_page(page):
+        """Return the page's text fragments as ``(text, x, y)`` tuples."""
+        items = []
+
+        def visitor_body(text, cm, tm, font_dict, font_size):
+            # tm[4] and tm[5] are read as the fragment's x/y position.
+            x, y = tm[4], tm[5]
+            # Drop empty fragments and anything reported at the origin.
+            if text == "" or (x == 0 and y == 0):
+                return
+
+            items.append((text, x, y))
+
+        page.extract_text(visitor_text=visitor_body)
+
+        return items
+
+    stations = {}
+
+    print("Processing station index PDF", file=sys.stderr)
+
+    for page in reader.pages:
+        items = extract_text_from_page(page)
+
+        # Walk consecutive fragment pairs. The final fragment of a page has
+        # no successor and is never used as a code, which avoids indexing
+        # past the end of the list.
+        for (text, _, _), (next_text, _, _) in zip(items, items[1:]):
+            code = text.strip()
+
+            # fullmatch: the fragment must be exactly three capitals, not
+            # merely start with them (e.g. a heading in upper case).
+            if (
+                re.fullmatch(r"[A-Z]{3}", code)
+                and next_text != "\n"
+                and "(continued)" not in next_text
+            ):
+                stations[code] = next_text.strip()
+
+    return stations
+

 url = "https://overpass-api.de/api/interpreter"

@@ -25,13 +105,26 @@ for elem in rj.get("elements", []):
    tags = elem.get("tags", {})

    crs = tags.get("ref:crs")
-    if crs is None:
+    if crs is None or is_extraneous_code(crs):
        continue

+    name = tags.get("name", "")
+    name = case_insensitive_rstrip(name, "high level")
+    name = case_insensitive_rstrip(name, "low level")
+
    out[crs] = {
        "lat": elem.get("lat", 0),
        "lon": elem.get("lon", 0),
-        "name": tags.get("name", ""),
+        "name": name,
    }

+
+nrt_names = get_names_from_pdf(NRT_STATION_INDEX_FILENAME)
+
+print("Overwriting station names with NRT names", file=sys.stderr)
+
+# Names extracted from the PDF take precedence over the names set earlier.
+for crs in out:
+    if crs in nrt_names:
+        out[crs]["name"] = nrt_names[crs]
+
-json.dump(out, open("stationData.json", "w"))
+# Use a context manager so the output file is flushed and closed
+# deterministically instead of relying on garbage collection.
+with open("stationData.json", "w") as f:
+    json.dump(out, f)
--- a/railmiles/internal/core/stationData.json
+++ b/railmiles/internal/core/stationData.json