Add scraper

Signed-off-by: AKP <tom@tdpain.net>
This commit is contained in:
akp 2022-11-07 23:18:29 +00:00
parent 23055fc798
commit 9139137858
No known key found for this signature in database
GPG key ID: AA5726202C8879B7
2 changed files with 183 additions and 0 deletions

View file

@@ -0,0 +1,89 @@
from typing import *
from enum import Enum
from dataclasses import dataclass
import requests
from bs4 import BeautifulSoup
class MachineState(Enum):
    """Availability state of a laundry machine, as reported by the Circuit site.

    Values are the raw state strings used by the scraper's classification.
    """

    Available = "AVAIL"
    InUse = "IN_USE"
    Completed = "COMPLETED"
    # Used when the site marks the machine with its "unknown" CSS class.
    Unknown = "UNKNOWN"
class MachineType(Enum):
    """Kind of laundry machine listed on a Circuit laundry-site page."""

    Washer = "WASHER"
    Dryer = "DRYER"
@dataclass(init=False, repr=True)
class Machine:
    """A single laundry machine scraped from a Circuit laundry-site page.

    ``init=False``: instances are created bare and populated field-by-field
    by the scraper, so no generated ``__init__`` is wanted; ``repr=True``
    keeps the generated debug-friendly ``__repr__``.
    """

    # Machine identifier text from the page title, with the word
    # "washer"/"dryer" stripped out.
    number: str
    # Washer or dryer, inferred from the title text.
    type: MachineType
    # Current availability state, inferred from the title's CSS classes.
    state: MachineState
    # Minutes left in the current cycle; set only when state is InUse,
    # otherwise None.
    minutes_remaining: Optional[int]
class CircuitScraper:
    """Scrapes machine availability from circuit.co.uk laundry-site pages."""

    _base_url: str = "https://www.circuit.co.uk/circuit-view/laundry-site"
    # CSS classes the Circuit site uses on machine title elements.
    # _class_washer appears on EVERY machine title, so it is only useful as
    # the implicit fallback when no more specific state class is present.
    _class_washer = "accordion__title"
    _class_dryer = "accordion__title--dryer"
    _class_in_use = "accordion__title--in-use"
    _class_completed = "accordion__title--idle"
    _class_state_unknown = "accordion__title--unknown"

    @staticmethod
    def _get_site_url(site_id: str) -> str:
        """Build the URL of the laundry-site page for the given site ID."""
        return CircuitScraper._base_url + f"/?site={site_id}"

    @staticmethod
    def get_site_machine_states(site_id: str) -> List[Machine]:
        """Fetch and parse the current machine states for one site.

        :param site_id: Circuit site identifier, sent as the ``?site=`` query
            parameter.
        :return: one :class:`Machine` per machine listed on the page.
        :raises requests.HTTPError: if the page request returns an error status.
        """
        site_url = CircuitScraper._get_site_url(site_id)
        # Timeout so a hung server cannot block the caller indefinitely.
        r = requests.get(site_url, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html.parser")

        machine_elements = []
        for section in soup.select("section.accordions--circuit-view.js-machine-type"):
            machine_elements += list(section.select("div.accordion"))

        # BUG FIX: the original appended parsed Machine objects onto the SAME
        # list it was iterating, so the loop also visited the Machines it had
        # just created (the list grew as fast as it was consumed, and a
        # Machine has no .select()). Parsed results now go to a separate list.
        machines: List[Machine] = []
        for item in machine_elements:
            states = item.select("div.accordion__slug div.accordion__title")
            if len(states) == 0:
                # Not a machine entry we can interpret; skip it.
                continue
            attr_classes = states[0].attrs.get("class", [])

            machine = Machine()
            descriptor_text = states[0].get_text().lower()
            machine.type = MachineType.Dryer if "dryer" in descriptor_text else MachineType.Washer
            machine.number = descriptor_text.replace("washer", "").replace("dryer", "").strip()

            # Note that CircuitScraper._class_washer is included on every item,
            # hence if none of the other classes are present, we fall back to
            # treating the machine as available.
            if CircuitScraper._class_in_use in attr_classes:
                machine.state = MachineState.InUse
            elif CircuitScraper._class_completed in attr_classes:
                machine.state = MachineState.Completed
            elif CircuitScraper._class_state_unknown in attr_classes:
                machine.state = MachineState.Unknown
            elif CircuitScraper._class_dryer in attr_classes:
                # Technically we could just pretend this one doesn't exist, but
                # we'll keep it in for posterity's sake.
                machine.state = MachineState.Available
            else:
                machine.state = MachineState.Available

            if machine.state == MachineState.InUse:
                # Remaining time is rendered like "12 mins" inside <p><span>.
                minutes_remaining_text = item.select("p span")[0].get_text(strip=True)
                machine.minutes_remaining = int(minutes_remaining_text.replace("mins", "").strip())
            else:
                machine.minutes_remaining = None
            machines.append(machine)
        return machines

94
poetry.lock generated Normal file
View file

@@ -0,0 +1,94 @@
[[package]]
name = "beautifulsoup4"
version = "4.11.1"
description = "Screen-scraping library"
category = "main"
optional = false
python-versions = ">=3.6.0"
[package.dependencies]
soupsieve = ">1.2"
[package.extras]
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]]
name = "certifi"
version = "2022.9.24"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = ">=3.6"
[[package]]
name = "charset-normalizer"
version = "2.1.1"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.6.0"
[package.extras]
unicode_backport = ["unicodedata2"]
[[package]]
name = "idna"
version = "3.4"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "requests"
version = "2.28.1"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=3.7, <4"
[package.dependencies]
certifi = ">=2017.4.17"
charset-normalizer = ">=2,<3"
idna = ">=2.5,<4"
urllib3 = ">=1.21.1,<1.27"
[package.extras]
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "soupsieve"
version = "2.3.2.post1"
description = "A modern CSS selector implementation for Beautiful Soup."
category = "main"
optional = false
python-versions = ">=3.6"
[[package]]
name = "urllib3"
version = "1.26.12"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4"
[package.extras]
brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"]
secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
content-hash = "f387c917af52c11962400d12327a47dde4e322812f89078f6592ed4809432d7a"
[metadata.files]
beautifulsoup4 = []
certifi = []
charset-normalizer = []
idna = []
requests = []
soupsieve = []
urllib3 = []