import re
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
import selenium.common.exceptions


class _BaseScraper:
    """Common base for the per-lab scrapers.

    Stores the Selenium driver the subclasses use and provides small helpers
    shared by several of them.
    """

    def __init__(self, driver):
        # The subclasses call .get(), .find_element(s)() and .execute_script()
        # on this object, i.e. it is used as a Selenium WebDriver.
        self.driver = driver

    @staticmethod
    def _strip_price(text: str, *noise: str) -> str:
        """Return *text* with every string in *noise*, then "£", removed.

        The *noise* tokens are removed first, in the order given, followed by
        the pound sign.
        """
        for token in (*noise, "£"):
            text = text.replace(token, "")
        return text

    @staticmethod
    def _first_with_text(elements, text: str, what: str):
        """Return the first element in *elements* whose ``.text`` equals *text*.

        Raises:
            AssertionError: if no element matches. The check used to be a bare
                ``assert``; it is raised explicitly so that it still fires
                under ``python -O`` while existing handlers keep matching.
        """
        for element in elements:
            if element.text == text:
                return element
        raise AssertionError(f"could not find {what} with text {text!r}")


class PPPCamera(_BaseScraper):
    """Scraper for PPP Cameras' 35mm develop-and-scan price."""

    def scrape(self) -> list[dict]:
        """Select the mid-res, no-prints options and return one price record.

        Only ``cost`` is read from the page; every other field is hard-coded.
        """
        self.driver.get("https://pppcameras.co.uk/lab/p/35mm-film")

        values_to_select = [
            # aria-label value, option value to select
            ("Select Services", "Dev + Mid Res"),
            ("Select Full Frame scans", "No"),
            ("Select Prints", "No Prints"),
        ]
        for aria_label, option_value in values_to_select:
            # repr() supplies the quotes around the attribute value.
            dropdown = self.driver.find_element(
                By.CSS_SELECTOR, f"select[aria-label={aria_label!r}]"
            )
            Select(dropdown).select_by_value(option_value)

        price = self.driver.find_element(By.CSS_SELECTOR, "div.product-price")
        return [
            {
                "lab": "PPP Cameras",
                "chemistry": "C41",
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "no",
                "sendShippingType": "",
                "returnShippingCost": "0",
                "returnShippingType": "Unspecified",
                "cost": self._strip_price(price.text),
                "resolution": "3637x2433",
                "resolutionName": "Mid",
                "url": "https://pppcameras.co.uk/lab/p/35mm-film",
            }
        ]


class AnalogueWonderland(_BaseScraper):
    """Scraper for Analogue Wonderland's 35mm development prices."""

    def scrape(self) -> list[dict]:
        """Return price records for C41 and B&W 35mm development.

        Only ``cost`` is read from the page (one page load per record);
        every other field is hard-coded.
        """
        return [
            {
                "lab": "Analogue Wonderland",
                "chemistry": "C41",
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "yes",
                "sendShippingType": "Royal Mail Tracked 48",
                "returnShippingCost": "3",
                "returnShippingType": "Unspecified",
                "cost": self._scrape_35mm_with_options(
                    [
                        # title of thing to click
                        "Colour (C-41)",
                        "Standard Scans",
                        "Correct and Rotate",
                    ]
                ),
                "resolution": "3024x2005",
                "resolutionName": "Standard",
                "url": "https://analoguewonderland.co.uk/products/35mm-film-development",
            },
            {
                "lab": "Analogue Wonderland",
                "chemistry": "B&W",
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "yes",
                "sendShippingType": "Royal Mail Tracked 48",
                "returnShippingCost": "3",
                "returnShippingType": "Unspecified",
                "cost": self._scrape_35mm_with_options(
                    [
                        # title of thing to click
                        "Black and White",
                        "Standard Scans",
                        "Correct and Rotate",
                    ]
                ),
                "resolution": "3024x2005",
                "resolutionName": "Standard",
                "url": "https://analoguewonderland.co.uk/products/35mm-film-development",
            },
        ]

    def _scrape_35mm_with_options(self, opts: list[str]) -> str:
        """Load the product page, click each option and return the price text.

        Args:
            opts: ``title`` attribute values of the swatch labels to click,
                in click order.

        Returns:
            The displayed price with the "£" sign removed.
        """
        self.driver.get(
            "https://analoguewonderland.co.uk/products/35mm-film-development"
        )

        # A "free film" popup may appear; dismiss it if it shows up within 5s.
        # Waiting for the button to be clickable (not merely present in the
        # DOM) avoids clicking it before it can be interacted with.
        try:
            close_button = WebDriverWait(self.driver, 5).until(
                expected_conditions.element_to_be_clickable(
                    (By.CSS_SELECTOR, '[aria-label="Close dialog"]')
                )
            )
        except selenium.common.exceptions.TimeoutException:
            pass  # no popup this time
        else:
            close_button.click()
            time.sleep(1)  # wait for animation to play

        # Make sure it's possible to see the buttons (clicking something that
        # is off the page raises) by scrolling to the review widget.
        self.driver.execute_script(
            "arguments[0].scrollIntoView(true)",
            self.driver.find_element(By.CSS_SELECTOR, ".jdgm-prev-badge__text"),
        )

        for title in opts:
            self.driver.find_element(
                By.CSS_SELECTOR, f"label.block-swatch__item[title={title!r}]"
            ).click()

        price = self.driver.find_element(By.CSS_SELECTOR, "span.price > span.money")
        return self._strip_price(price.text)


class Minilab(_BaseScraper):
    """Scraper for The Minilab's 35mm dev-and-scan prices."""

    def scrape(self) -> list[dict]:
        """Return price records for C41 and B&W 35mm dev + scan.

        Only ``cost`` is read from the page; every other field is hard-coded.
        """
        c41 = {
            "lab": "The Minilab",
            "chemistry": "C41",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "0.85",
            "returnShippingType": "Royal Mail 48",
            "resolution": "3024x2005",
            "resolutionName": "High JPEG",
            "url": "https://www.theminilab.co.uk/product-page/c41-dev-scan",
        }
        c41["cost"] = self._scrape_35mm_with_url(c41["url"])

        bw = {
            "lab": "The Minilab",
            "chemistry": "B&W",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "0.85",
            "returnShippingType": "Royal Mail 48",
            "resolution": "3024x2005",
            "resolutionName": "High JPEG",
            "url": "https://www.theminilab.co.uk/product-page/b-w-35mm-dev-scan",
        }
        bw["cost"] = self._scrape_35mm_with_url(bw["url"])

        return [c41, bw]

    def _scrape_35mm_with_url(self, url) -> str:
        """Open *url*, pick "High Res JPEG" in the dropdown, return the price.

        Returns:
            The displayed price with the "£" sign removed.

        Raises:
            selenium.common.exceptions.TimeoutException: if the form has not
                appeared within 10 seconds.
            AssertionError: if the "Select" dropdown or the "High Res JPEG"
                entry cannot be found.
        """
        self.driver.get(url)

        # Wait for the dynamically created form. until() either returns a
        # truthy value or raises TimeoutException, so no further check is
        # needed on its result.
        WebDriverWait(self.driver, 10).until(
            expected_conditions.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "label[for]"), "Resolution"
            ),
        )

        # The unset dropdown is the one still showing the text "Select".
        select_item = self._first_with_text(
            self.driver.find_elements(
                By.CSS_SELECTOR, 'div[data-hook="dropdown-base-text"]'
            ),
            "Select",
            "dropdown",
        )
        self.driver.execute_script("arguments[0].scrollIntoView(true)", select_item)
        select_item.click()

        high_res_item = self._first_with_text(
            self.driver.find_elements(By.CSS_SELECTOR, "span[aria-hidden=false]"),
            "High Res JPEG",
            "dropdown option",
        )
        high_res_item.click()

        price = self.driver.find_element(By.CSS_SELECTOR, "span[data-wix-price]")
        return self._strip_price(price.text)


class FilmProcessingCoUk(_BaseScraper):
    """Scraper for FilmProcessing.co.uk's 35mm processing prices."""

    def scrape(self) -> list[dict]:
        """Return price records for C41 and B&W 35mm processing.

        Only ``cost`` is read from the page; every other field is hard-coded.
        Note that the two product pages label their options slightly
        differently (singular/plural, capitalisation), hence the two lists.
        """
        c41 = {
            "lab": "FilmProcessing.co.uk",
            "chemistry": "C41",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "0",
            "returnShippingType": "Royal Mail 48",
            "resolution": "2728x1830",
            "resolutionName": "Standard",
            "url": "https://www.filmprocessing.co.uk/onlinestore/35mm-Colour-Film-Processing-p68571250",
        }
        c41["cost"] = self._scrape_35mm_with_url_and_opts(
            c41["url"],
            [
                ("Exposure", "Up to 39 Exposure"),
                ("Print Size", "No Prints Required"),
                ("Extra Sets (Per Film)", "No Extra Set Required"),
                ("Film to CD / Dropbox", "Medium Quality Dropbox"),
            ],
        )

        bw = {
            "lab": "FilmProcessing.co.uk",
            "chemistry": "B&W",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "0",
            "returnShippingType": "Royal Mail 48",
            "resolution": "2728x1830",
            "resolutionName": "Standard",
            "url": "https://www.filmprocessing.co.uk/onlinestore/35mm-Black-&-White-Film-Processing-p345592049",
        }
        bw["cost"] = self._scrape_35mm_with_url_and_opts(
            bw["url"],
            [
                ("Exposures", "Up to 39 Exposures"),
                ("Print Size", "No Prints Required"),
                ("Extra Sets (per Film)", "No Extra Sets Required"),
                ("Film to CD / Dropbox", "Medium Quality Dropbox"),
            ],
        )

        return [c41, bw]

    def _scrape_35mm_with_url_and_opts(
        self, url: str, opts: list[tuple[str, str]]
    ) -> str:
        """Open the store iframe for *url*, apply *opts*, return the price.

        Args:
            url: product page URL; its last path segment is used to recognise
                the matching store iframe.
            opts: ``(aria-label value, option value to select)`` pairs, one
                per ``<select>`` on the product form.

        Returns:
            The displayed price with the "£" sign removed.

        Raises:
            selenium.common.exceptions.TimeoutException: if the iframe or the
                product form does not appear within 10 seconds.
            AssertionError: if no store iframe matches *url*.
        """
        iframe_selector = 'iframe[aria-label="Online Store"][src]'

        self.driver.get(url)
        # wait for one iframe to get its source
        WebDriverWait(self.driver, 10).until(
            expected_conditions.presence_of_element_located(
                (By.CSS_SELECTOR, iframe_selector)
            )
        )
        all_iframes = self.driver.find_elements(By.CSS_SELECTOR, iframe_selector)

        # Search for the frame that contains the store options. The whole
        # list is scanned, so if several frames match, the last one wins.
        slug = url.split("/")[-1]
        target_iframe = None
        for frame in all_iframes:
            if slug in frame.get_attribute("src"):
                target_iframe = frame
        if target_iframe is None:
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped under `python -O`.
            raise AssertionError(f"no store iframe found for {slug!r}")

        # Navigate into the iframe's document directly.
        self.driver.get(target_iframe.get_attribute("src"))
        # wait for form to be dynamically loaded in
        WebDriverWait(self.driver, 10).until(
            expected_conditions.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "div.product-details-module__title"), "Exposure"
            )
        )

        for aria_label, option_value in opts:
            dropdown = self.driver.find_element(
                By.CSS_SELECTOR, f"select[aria-label={aria_label!r}]"
            )
            Select(dropdown).select_by_value(option_value)

        price = self.driver.find_element(
            By.CSS_SELECTOR, "span.details-product-price__value"
        )
        return self._strip_price(price.text)


class AGPhotoLab(_BaseScraper):
    """Scraper for AG Photo Lab's 35mm processing prices."""

    def scrape(self) -> list[dict]:
        """Return price records for C41 and B&W 35mm processing.

        Only ``cost`` is read from the page; every other field is hard-coded.
        """
        c41 = {
            "lab": "AG Photo Lab",
            "chemistry": "C41",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "yes",
            "sendShippingType": "Freepost",
            "returnShippingCost": "4.94",
            "returnShippingType": "Royal Mail 24",
            "resolution": "3089x2048",
            "resolutionName": "Standard JPEG",
            "url": "https://www.ag-photolab.co.uk/product/c41/",
        }
        c41["cost"] = self._scrape_35mm_with_url_and_options(
            c41["url"],
            [
                ("5c8fbe78a2c805.23255089", "35mm_0"),  # film format
                ("666aa5b7aab344.41469556", "Standard sleeving_0"),  # film sleeving
                ("5c8fcb67a26bd1.60477546", "Standard Scan 8bit JPEG_0"),  # scans
                (
                    "5c8fcbc6a26c40.29952473",
                    "Upload files via the web_0",
                ),  # scan delivery
            ],
        )

        bw = {
            "lab": "AG Photo Lab",
            "chemistry": "B&W",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "yes",
            "sendShippingType": "Freepost",
            "returnShippingCost": "4.94",
            "returnShippingType": "Royal Mail 24",
            "resolution": "3089x2048",
            "resolutionName": "Standard JPEG",
            "url": "https://www.ag-photolab.co.uk/product/black-white/",
        }
        bw["cost"] = self._scrape_35mm_with_url_and_options(
            bw["url"],
            [
                ("5c90be26ccc352.83454456", "35mm_0"),  # film format
                ("5c90c037ccc3d4.45704796", "Standard Sleeving_0"),  # film sleeving
                ("5c90be26ccc341.38603868", "Standard Scan 8bit JPEG_0"),  # scans
                (
                    "5c90c097ccc3e6.45684541",
                    "Upload files via the web_0",
                ),  # scan delivery
            ],
        )

        return [c41, bw]

    def _scrape_35mm_with_url_and_options(
        self, url: str, opts: list[tuple[str, str]]
    ) -> str:
        """Open *url*, choose every option in *opts* and return the price.

        Args:
            url: product page URL.
            opts: ``(data-uniqid value, option value to select)`` pairs; the
                first pair's container is also used to detect that the
                selection boxes have loaded.

        Returns:
            The displayed price with spaces and the "£" sign removed.

        Raises:
            selenium.common.exceptions.TimeoutException: if the selection
                boxes do not appear within 10 seconds.
        """
        self.driver.get(url)

        # The cookie popup is big enough to get in the way, so reject it if
        # it appears within 3s. Waiting for the button to be clickable (not
        # merely present) avoids clicking it before it can be interacted with.
        try:
            reject_button = WebDriverWait(self.driver, 3).until(
                expected_conditions.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button.cky-btn-reject")
                )
            )
        except selenium.common.exceptions.TimeoutException:
            pass  # no cookie popup this time
        else:
            reject_button.click()

        # wait for the selection boxes to appear
        WebDriverWait(self.driver, 10).until(
            expected_conditions.presence_of_element_located(
                (By.CSS_SELECTOR, f'div[data-uniqid="{opts[0][0]}"]')
            )
        )

        # make sure it's possible to see the selection boxes
        self.driver.execute_script(
            "arguments[0].scrollIntoView(true)",
            self.driver.find_element(By.CSS_SELECTOR, "h1.product_title"),
        )

        for uniqid, option_value in opts:
            container = self.driver.find_element(
                By.CSS_SELECTOR, f"[data-uniqid={uniqid!r}]"
            )
            dropdown = container.find_element(By.TAG_NAME, "select")
            Select(dropdown).select_by_value(option_value)

        price = self.driver.find_element(By.CSS_SELECTOR, "span.price.amount.final")
        return self._strip_price(price.text, " ")


class HarmanLab(_BaseScraper):
    """Scraper for Harman Lab's 35mm developing prices."""

    def scrape(self) -> list[dict]:
        """Return price records for the C41 and B&W product variants.

        Only ``cost`` is read from the page; every other field is hard-coded.
        """
        c41 = {
            "lab": "Harman Lab",
            "chemistry": "C41",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "2.95",
            "returnShippingType": "Royal Mail 24",
            "resolution": "1500x2250",
            "resolutionName": "Std",
            "url": "https://harmanlab.com/products/developing-only-135-colour-c41-film?variant=42500108189938",
        }
        c41["cost"] = self._scrape_with_url(c41["url"])

        bw = {
            "lab": "Harman Lab",
            "chemistry": "B&W",
            "format": "35mm",
            "subformat": "full frame",
            "includesSendShipping": "no",
            "sendShippingType": "",
            "returnShippingCost": "2.95",
            "returnShippingType": "Royal Mail 24",
            "resolution": "1500x2250",
            "resolutionName": "Std",
            "url": "https://harmanlab.com/products/black-and-white-film-developing-only?variant=42499934716146",
        }
        bw["cost"] = self._scrape_with_url(bw["url"])

        return [c41, bw]

    def _scrape_with_url(self, url: str) -> str:
        """Open *url* and return the regular price text.

        Returns:
            The displayed price with " GBP" and the "£" sign removed.
        """
        self.driver.get(url)
        price = self.driver.find_element(
            By.CSS_SELECTOR, "span.price-item.price-item--regular"
        )
        return self._strip_price(price.text, " GBP")


class TheFilmSafe(_BaseScraper):
    """Scraper for The Film Safe's 35mm developing + scanning prices."""

    # Note for the future: they have a bulk discount of £1 per roll
    def scrape(self) -> list[dict]:
        """Return price records for C41 and B&W 35mm dev + scan.

        Only ``cost`` is read from the page (one page load per record);
        every other field is hard-coded.
        """
        return [
            {
                "lab": "The Film Safe",
                "chemistry": "C41",
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "no",
                "sendShippingType": "",
                "returnShippingCost": "2",
                "returnShippingType": "Royal Mail 48",
                "cost": self._get_price_for_options(
                    [
                        ("product-dropdown-1_2", "C41 (colour)"),  # film process
                        ("product-dropdown-3_4_18", "35mm"),  # film format
                        (
                            "product-dropdown-13_14_16_17",
                            "Med Res JPEG (£9)",
                        ),  # image resolution
                        (
                            "product-dropdown-9_10_11",
                            "CALM",
                        ),  # charity choice (required to get a price)
                    ]
                ),
                "resolution": "3100x2100",
                "resolutionName": "Med",
                "url": "https://www.thefilmsafe.co.uk/product-page/developing-scanning",
            },
            {
                "lab": "The Film Safe",
                "chemistry": "B&W",
                "format": "35mm",
                "subformat": "full frame",
                "includesSendShipping": "no",
                "sendShippingType": "",
                "returnShippingCost": "2",
                "returnShippingType": "Royal Mail 48",
                "cost": self._get_price_for_options(
                    [
                        ("product-dropdown-1_2", "BW"),  # film process
                        ("product-dropdown-3_4_18", "35mm"),  # film format
                        (
                            "product-dropdown-13_14_16_17",
                            "Med Res JPEG (£9)",
                        ),  # image resolution
                        (
                            "product-dropdown-9_10_11",
                            "CALM",
                        ),  # charity choice (required to get a price)
                    ]
                ),
                "resolution": "3100x2100",
                "resolutionName": "Med",
                "url": "https://www.thefilmsafe.co.uk/product-page/developing-scanning",
            },
        ]

    def _get_price_for_options(self, opts: list[tuple[str, str]]) -> str:
        """Load the product page, choose every option and return the price.

        Args:
            opts: ``(id of corresponding label, title of div to click)``
                pairs, applied in order.

        Returns:
            The displayed price with the "£" sign removed.

        Raises:
            selenium.common.exceptions.TimeoutException: if the dropdown
                labels have not been wired up within 10 seconds.
        """
        self.driver.get(
            "https://www.thefilmsafe.co.uk/product-page/developing-scanning"
        )
        # wait for the dropdown options to get stitched together to the form boxes
        WebDriverWait(self.driver, 10).until(
            expected_conditions.presence_of_element_located(
                (By.CSS_SELECTOR, "label#product-dropdown-1_2[for]")
            )
        )

        for label_id, option_title in opts:
            # Open the dropdown that belongs to this label...
            self.driver.find_element(
                By.CSS_SELECTOR, f"button[aria-labelledby={label_id!r}]"
            ).click()
            # ...then click the wanted entry in the popover that opened.
            self.driver.find_element(
                By.CSS_SELECTOR,
                f'div[data-hook="popover-content"] div[title={option_title!r}]',
            ).click()

        price = self.driver.find_element(By.CSS_SELECTOR, "span[data-wix-price]")
        return self._strip_price(price.text)


# TODO: https://www.exposurefilmlab.com/