film-dev-cost-scraper/scraper/scrapers.py
2024-08-26 01:08:35 +01:00

534 lines
19 KiB
Python

import re
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
import selenium.common.exceptions
class _BaseScraper:
    """Common base for the per-lab price scrapers in this module.

    Stores the browser driver that a scraper navigates with. Every concrete
    scraper in this file overrides ``scrape()`` to return its price rows.
    """

    def __init__(self, driver):
        """Remember *driver* so that ``scrape()`` can use it.

        Args:
            driver: the browser driver object; subclasses call ``get``,
                ``find_element``, ``find_elements`` and ``execute_script``
                on it.
        """
        self.driver = driver

    def scrape(self) -> list[dict]:
        """Return the scraped price rows, one dict per row.

        Raises:
            NotImplementedError: always; concrete scrapers must override this.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement scrape()"
        )
class PPPCamera(_BaseScraper):
    """Scraper for the PPP Cameras 35mm lab product page."""

    def scrape(self) -> list[dict]:
        """Choose the dev + mid-res options and read the displayed price.

        Returns:
            A one-element list holding the C41 35mm row. ``cost`` is the text
            of the ``div.product-price`` element with the pound sign removed.
        """
        page_url = "https://pppcameras.co.uk/lab/p/35mm-film"
        self.driver.get(page_url)

        # Maps each dropdown's aria-label to the option value to select.
        choices = {
            "Select Services": "Dev + Mid Res",
            "Select Full Frame scans": "No",
            "Select Prints": "No Prints",
        }
        for label, value in choices.items():
            dropdown = self.driver.find_element(
                By.CSS_SELECTOR, f"select[aria-label={label!r}]"
            )
            Select(dropdown).select_by_value(value)

        price_text = self.driver.find_element(
            By.CSS_SELECTOR, "div.product-price"
        ).text
        row = {
            "lab": "PPP Cameras",
            "chemistry": "C41",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "0",
            "returnShippingType": "Unspecified",
            "cost": price_text.replace("£", ""),
            "resolution": "3637x2433",
            "resolutionName": "Mid",
            "url": "https://pppcameras.co.uk/lab/p/35mm-film",
        }
        return [row]
class AnalogueWonderland(_BaseScraper):
    """Scraper for Analogue Wonderland's 35mm film development page."""

    def scrape(self) -> list[dict]:
        """Scrape the C41 and B&W standard-scan prices, in that order.

        Returns:
            Two rows (C41 first, then B&W) that differ only in ``chemistry``
            and in the scraped ``cost``.
        """
        # (chemistry recorded in the row, title of the process swatch to click)
        processes = (
            ("C41", "Colour (C-41)"),
            ("B&W", "Black and White"),
        )
        rows = []
        for chemistry, process_title in processes:
            price = self._scrape_35mm_with_options(
                [process_title, "Standard Scans", "Correct and Rotate"]
            )
            rows.append(
                {
                    "lab": "Analogue Wonderland",
                    "chemistry": chemistry,
                    "format": "35mm",
                    "subformat": "full frame",
                    "includesSendShipping": "yes",
                    "sendShippingType": "Royal Mail Tracked 48",
                    "returnShippingCost": "3",
                    "returnShippingType": "Unspecified",
                    "cost": price,
                    "resolution": "3024x2005",
                    "resolutionName": "Standard",
                    "url": "https://analoguewonderland.co.uk/products/35mm-film-development",
                }
            )
        return rows

    def _scrape_35mm_with_options(self, opts: list[str]) -> str:
        """Load the product page, click each swatch in *opts*, return the price.

        Args:
            opts: ``title`` attributes of the swatch labels to click, in order.

        Returns:
            Text of the price element with the pound sign removed.
        """
        self.driver.get(
            "https://analoguewonderland.co.uk/products/35mm-film-development"
        )

        # A promotional dialog may appear; close it if it shows up within 5 s,
        # otherwise carry on without it.
        try:
            close_button = WebDriverWait(self.driver, 5).until(
                expected_conditions.presence_of_element_located(
                    (By.CSS_SELECTOR, '[aria-label="Close dialog"]')
                )
            )
            close_button.click()
            time.sleep(1)  # give the closing animation time to finish
        except selenium.common.exceptions.TimeoutException:
            pass

        # Clicking something that is off-page raises, so scroll to the review
        # widget first to bring the swatches into view.
        review_badge = self.driver.find_element(
            By.CSS_SELECTOR, ".jdgm-prev-badge__text"
        )
        self.driver.execute_script(
            "arguments[0].scrollIntoView(true)", review_badge
        )

        for swatch_title in opts:
            self.driver.find_element(
                By.CSS_SELECTOR,
                f"label.block-swatch__item[title={swatch_title!r}]",
            ).click()

        price_elem = self.driver.find_element(
            By.CSS_SELECTOR, "span.price > span.money"
        )
        return price_elem.text.replace("£", "")
class Minilab(_BaseScraper):
    """Scraper for The Minilab's 35mm dev + scan product pages."""

    def scrape(self) -> list[dict]:
        """Scrape the C41 and B&W high-res JPEG prices, in that order.

        Returns:
            Two rows (C41 first, then B&W); ``cost`` is added to each row after
            its product page has been scraped.
        """
        products = (
            ("C41", "https://www.theminilab.co.uk/product-page/c41-dev-scan"),
            ("B&W", "https://www.theminilab.co.uk/product-page/b-w-35mm-dev-scan"),
        )
        rows = []
        for chemistry, url in products:
            row = {
                "lab": "The Minilab",
                "chemistry": chemistry,
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "no",
                "sendShippingType": "",
                "returnShippingCost": "0.85",
                "returnShippingType": "Royal Mail 48",
                "resolution": "3024x2005",
                "resolutionName": "High JPEG",
                "url": url,
            }
            row["cost"] = self._scrape_35mm_with_url(url)
            rows.append(row)
        return rows

    def _first_with_text(self, css: str, text: str):
        """Return the first element matching *css* whose text equals *text*.

        Raises:
            selenium.common.exceptions.NoSuchElementException: if no matching
                element has that exact text.
        """
        for candidate in self.driver.find_elements(By.CSS_SELECTOR, css):
            if candidate.text == text:
                return candidate
        # Raise explicitly instead of using assert: asserts are stripped under
        # ``python -O`` and the failure would surface later as an
        # AttributeError on None.
        raise selenium.common.exceptions.NoSuchElementException(
            f"no element matching {css!r} has text {text!r}"
        )

    def _scrape_35mm_with_url(self, url) -> str:
        """Open *url*, pick "High Res JPEG" in the dropdown, return the price.

        Args:
            url: product page to load.

        Returns:
            Text of the ``span[data-wix-price]`` element with the pound sign
            removed.

        Raises:
            selenium.common.exceptions.TimeoutException: if the form label
                containing "Resolution" does not appear within 10 seconds.
            selenium.common.exceptions.NoSuchElementException: if the "Select"
                dropdown or the "High Res JPEG" option cannot be found.
        """
        self.driver.get(url)

        # The form is built dynamically; wait for it. until() only returns a
        # truthy value (it raises TimeoutException otherwise), so no further
        # check of its result is needed.
        WebDriverWait(self.driver, 10).until(
            expected_conditions.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "label[for]"), "Resolution"
            ),
        )

        # The dropdown that still shows "Select" is the one to open.
        dropdown = self._first_with_text(
            'div[data-hook="dropdown-base-text"]', "Select"
        )
        self.driver.execute_script("arguments[0].scrollIntoView(true)", dropdown)
        dropdown.click()

        self._first_with_text("span[aria-hidden=false]", "High Res JPEG").click()

        return self.driver.find_element(
            By.CSS_SELECTOR, "span[data-wix-price]"
        ).text.replace("£", "")
class FilmProcessingCoUk(_BaseScraper):
    """Scraper for FilmProcessing.co.uk's 35mm colour and B&W store pages."""

    @staticmethod
    def _base_row(chemistry: str, url: str) -> dict:
        """Build the fixed (non-scraped) fields of a row for this lab."""
        return {
            "lab": "FilmProcessing.co.uk",
            "chemistry": chemistry,
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "0",
            "returnShippingType": "Royal Mail 48",
            "resolution": "2728x1830",
            "resolutionName": "Standard",
            "url": url,
        }

    def scrape(self) -> list[dict]:
        """Scrape the C41 and B&W medium-quality Dropbox prices, in that order.

        Returns:
            Two rows (C41 first, then B&W); ``cost`` is added to each row after
            its store page has been scraped.
        """
        c41 = self._base_row(
            "C41",
            "https://www.filmprocessing.co.uk/onlinestore/35mm-Colour-Film-Processing-p68571250",
        )
        c41["cost"] = self._scrape_35mm_with_url_and_opts(
            c41["url"],
            [
                ("Exposure", "Up to 39 Exposure"),
                ("Print Size", "No Prints Required"),
                ("Extra Sets (Per Film)", "No Extra Set Required"),
                ("Film to CD / Dropbox", "Medium Quality Dropbox"),
            ],
        )

        # The B&W page words its labels and options slightly differently from
        # the colour page, so the two option lists are kept separate.
        bw = self._base_row(
            "B&W",
            "https://www.filmprocessing.co.uk/onlinestore/35mm-Black-&-White-Film-Processing-p345592049",
        )
        bw["cost"] = self._scrape_35mm_with_url_and_opts(
            bw["url"],
            [
                ("Exposures", "Up to 39 Exposures"),
                ("Print Size", "No Prints Required"),
                ("Extra Sets (per Film)", "No Extra Sets Required"),
                ("Film to CD / Dropbox", "Medium Quality Dropbox"),
            ],
        )
        return [c41, bw]

    def _scrape_35mm_with_url_and_opts(
        self, url: str, opts: list[tuple[str, str]]
    ) -> str:
        """Open the store iframe for *url*, apply *opts*, return the price.

        Args:
            url: product page whose last path segment identifies the iframe.
            opts: ``(aria-label of the <select>, option value to select)``
                pairs, applied in order.

        Returns:
            Text of the ``span.details-product-price__value`` element with the
            pound sign removed.

        Raises:
            selenium.common.exceptions.TimeoutException: if the iframe or the
                product form does not appear within 10 seconds.
            selenium.common.exceptions.NoSuchElementException: if no store
                iframe references the product, or a dropdown is missing.
        """
        self.driver.get(url)

        frame_locator = (
            By.CSS_SELECTOR,
            'iframe[aria-label="Online Store"][src]',
        )
        # Wait until at least one store iframe has been given its source.
        WebDriverWait(self.driver, 10).until(
            expected_conditions.presence_of_element_located(frame_locator)
        )

        # Find the frame whose src mentions this product. If several match,
        # the last one is used (same selection as before this rewrite).
        product_slug = url.split("/")[-1]
        store_src = None
        for frame in self.driver.find_elements(*frame_locator):
            src = frame.get_attribute("src")
            if src and product_slug in src:
                store_src = src

        # Raise explicitly instead of using assert: asserts are stripped under
        # ``python -O`` and the failure would surface later as an
        # AttributeError on None.
        if store_src is None:
            raise selenium.common.exceptions.NoSuchElementException(
                f"no 'Online Store' iframe references {product_slug!r}"
            )

        # Load the iframe's document directly so its form can be driven.
        self.driver.get(store_src)
        # The form is loaded dynamically; wait for its title to appear.
        WebDriverWait(self.driver, 10).until(
            expected_conditions.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "div.product-details-module__title"), "Exposure"
            )
        )

        for aria_label, option_value in opts:
            dropdown = self.driver.find_element(
                By.CSS_SELECTOR, f"select[aria-label={repr(aria_label)}]"
            )
            Select(dropdown).select_by_value(option_value)

        return self.driver.find_element(
            By.CSS_SELECTOR, "span.details-product-price__value"
        ).text.replace("£", "")
class AGPhotoLab(_BaseScraper):
    """Scraper for AG Photo Lab's C41 and black & white product pages."""

    def scrape(self) -> list[dict]:
        """Scrape the C41 and B&W standard-scan prices, in that order.

        Returns:
            Two rows (C41 first, then B&W); ``cost`` is added to each row after
            its product page has been scraped.
        """
        # (chemistry, product url, [(data-uniqid, option value), ...]).
        # The option lists are in the order: film format, film sleeving,
        # scans, scan delivery.
        products = (
            (
                "C41",
                "https://www.ag-photolab.co.uk/product/c41/",
                [
                    ("5c8fbe78a2c805.23255089", "35mm_0"),
                    ("666aa5b7aab344.41469556", "Standard sleeving_0"),
                    ("5c8fcb67a26bd1.60477546", "Standard Scan 8bit JPEG_0"),
                    ("5c8fcbc6a26c40.29952473", "Upload files via the web_0"),
                ],
            ),
            (
                "B&W",
                "https://www.ag-photolab.co.uk/product/black-white/",
                [
                    ("5c90be26ccc352.83454456", "35mm_0"),
                    ("5c90c037ccc3d4.45704796", "Standard Sleeving_0"),
                    ("5c90be26ccc341.38603868", "Standard Scan 8bit JPEG_0"),
                    ("5c90c097ccc3e6.45684541", "Upload files via the web_0"),
                ],
            ),
        )
        rows = []
        for chemistry, url, options in products:
            row = {
                "lab": "AG Photo Lab",
                "chemistry": chemistry,
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "yes",
                "sendShippingType": "Freepost",
                "returnShippingCost": "4.94",
                "returnShippingType": "Royal Mail 24",
                "resolution": "3089x2048",
                "resolutionName": "Standard JPEG",
                "url": url,
            }
            row["cost"] = self._scrape_35mm_with_url_and_options(url, options)
            rows.append(row)
        return rows

    def _scrape_35mm_with_url_and_options(
        self, url: str, opts: list[tuple[str, str]]
    ) -> str:
        """Open *url*, choose each option in *opts*, return the final price.

        Args:
            url: product page to load.
            opts: ``(data-uniqid of the option container, option value to
                select)`` pairs, applied in order.

        Returns:
            Text of the ``span.price.amount.final`` element with spaces and the
            pound sign removed.
        """
        self.driver.get(url)

        # The cookie banner is large enough to get in the way, so reject it if
        # it appears within 3 s; otherwise carry on without it.
        try:
            WebDriverWait(self.driver, 3).until(
                expected_conditions.presence_of_element_located(
                    (By.CSS_SELECTOR, "button.cky-btn-reject")
                )
            ).click()
        except selenium.common.exceptions.TimeoutException:
            pass

        # Wait for the selection boxes (checked via the first option) to appear.
        first_uniqid = opts[0][0]
        WebDriverWait(self.driver, 10).until(
            expected_conditions.presence_of_element_located(
                (By.CSS_SELECTOR, f'div[data-uniqid="{first_uniqid}"]')
            )
        )

        # Scroll to the product title so the selection boxes are in view.
        title = self.driver.find_element(By.CSS_SELECTOR, "h1.product_title")
        self.driver.execute_script("arguments[0].scrollIntoView(true)", title)

        for uniqid, option_value in opts:
            container = self.driver.find_element(
                By.CSS_SELECTOR, f"[data-uniqid={uniqid!r}]"
            )
            dropdown = container.find_element(By.TAG_NAME, "select")
            Select(dropdown).select_by_value(option_value)

        price_text = self.driver.find_element(
            By.CSS_SELECTOR, "span.price.amount.final"
        ).text
        return price_text.replace(" ", "").replace("£", "")
class HarmanLab(_BaseScraper):
    """Scraper for Harman Lab's colour and black & white product pages."""

    def scrape(self) -> list[dict]:
        """Scrape the C41 and B&W prices, in that order.

        Returns:
            Two rows (C41 first, then B&W); ``cost`` is added to each row after
            its product page has been scraped.
        """
        products = (
            (
                "C41",
                "https://harmanlab.com/products/developing-only-135-colour-c41-film?variant=42500108189938",
            ),
            (
                "B&W",
                "https://harmanlab.com/products/black-and-white-film-developing-only?variant=42499934716146",
            ),
        )
        rows = []
        for chemistry, url in products:
            row = {
                "lab": "Harman Lab",
                "chemistry": chemistry,
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "no",
                "sendShippingType": "",
                "returnShippingCost": "2.95",
                "returnShippingType": "Royal Mail 24",
                "resolution": "1500x2250",
                "resolutionName": "Std",
                "url": url,
            }
            row["cost"] = self._scrape_with_url(url)
            rows.append(row)
        return rows

    def _scrape_with_url(self, url: str) -> str:
        """Open *url* and return the regular price shown on the page.

        Args:
            url: product page to load.

        Returns:
            Text of the regular price element with the " GBP" suffix and the
            pound sign removed.
        """
        self.driver.get(url)
        price_text = self.driver.find_element(
            By.CSS_SELECTOR, "span.price-item.price-item--regular"
        ).text
        return price_text.replace(" GBP", "").replace("£", "")
class TheFilmSafe(_BaseScraper):
    """Scraper for The Film Safe's developing + scanning product page."""

    # Note for the future: they have a bulk discount of £1 per roll

    def scrape(self) -> list[dict]:
        """Scrape the C41 and B&W medium-resolution prices, in that order.

        Returns:
            Two rows (C41 first, then B&W) that differ only in ``chemistry``
            and in the scraped ``cost``.
        """
        # (chemistry recorded in the row, title of the film process option)
        processes = (
            ("C41", "C41 (colour)"),
            ("B&W", "BW"),
        )
        rows = []
        for chemistry, process_title in processes:
            price = self._get_price_for_options(
                [
                    ("product-dropdown-1_2", process_title),  # film process
                    ("product-dropdown-3_4_18", "35mm"),  # film format
                    # image resolution
                    ("product-dropdown-13_14_16_17", "Med Res JPEG (£9)"),
                    # charity choice (required to get a price)
                    ("product-dropdown-9_10_11", "CALM"),
                ]
            )
            rows.append(
                {
                    "lab": "The Film Safe",
                    "chemistry": chemistry,
                    "format": "35mm",
                    "subformat": "full frame",
                    "includesSendShipping": "no",
                    "sendShippingType": "",
                    "returnShippingCost": "2",
                    "returnShippingType": "Royal Mail 48",
                    "cost": price,
                    "resolution": "3100x2100",
                    "resolutionName": "Med",
                    "url": "https://www.thefilmsafe.co.uk/product-page/developing-scanning",
                }
            )
        return rows

    def _get_price_for_options(self, opts: list[tuple[str, str]]) -> str:
        """Load the product page, choose each dropdown option, return the price.

        Args:
            opts: ``(id of the dropdown's label, title of the option div to
                click)`` pairs, applied in order.

        Returns:
            Text of the ``span[data-wix-price]`` element with the pound sign
            removed.
        """
        self.driver.get(
            "https://www.thefilmsafe.co.uk/product-page/developing-scanning"
        )
        # Wait until the first dropdown's label has been linked to its form
        # control (i.e. it has gained a ``for`` attribute).
        WebDriverWait(self.driver, 10).until(
            expected_conditions.presence_of_element_located(
                (By.CSS_SELECTOR, "label#product-dropdown-1_2[for]")
            )
        )

        for label_id, option_title in opts:
            # Open the dropdown, then click the wanted entry in its popover.
            self.driver.find_element(
                By.CSS_SELECTOR, f"button[aria-labelledby={label_id!r}]"
            ).click()
            self.driver.find_element(
                By.CSS_SELECTOR,
                f'div[data-hook="popover-content"] div[title={option_title!r}]',
            ).click()

        price_elem = self.driver.find_element(
            By.CSS_SELECTOR, "span[data-wix-price]"
        )
        return price_elem.text.replace("£", "")
# TODO: https://www.exposurefilmlab.com/