From daa788ba799176aa06d4f970fdbc461889bceee7 Mon Sep 17 00:00:00 2001 From: AKP Date: Mon, 26 Aug 2024 01:08:35 +0100 Subject: [PATCH] Revert "Revert "Initial commit"" This reverts commit 3ea984c2eff81e484a1bcca72d2b7cf3e90b4855. --- bundle.sh | 13 + poetry.lock | 367 ++++++++++++++++++++++++++ postprocessor/__main__.py | 245 +++++++++++++++++ postprocessor/page.js | 9 + pyproject.toml | 17 ++ pyrightconfig.json | 4 + scraper/__main__.py | 51 ++++ scraper/scrapers.py | 534 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 1240 insertions(+) create mode 100644 bundle.sh create mode 100644 poetry.lock create mode 100644 postprocessor/__main__.py create mode 100644 postprocessor/page.js create mode 100644 pyproject.toml create mode 100644 pyrightconfig.json create mode 100644 scraper/__main__.py create mode 100644 scraper/scrapers.py diff --git a/bundle.sh b/bundle.sh new file mode 100644 index 0000000..aff5496 --- /dev/null +++ b/bundle.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +set -ex + +DATAFILE=$1 +DIR=$(mktemp -p . -d) + +poetry run python3 postprocessor/ $DATAFILE $DIR/index.html +cp $DATAFILE $DIR/rawdata.json + +zip --junk-paths bundle.zip $DIR/* + +rm -r $DIR \ No newline at end of file diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..0fcddc3 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,367 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+ +[[package]] +name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "certifi" +version = "2024.7.4" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, +] + +[[package]] +name = "cffi" +version = "1.17.0" +description = "Foreign Function Interface for Python calling C code." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "cffi-1.17.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f9338cc05451f1942d0d8203ec2c346c830f8e86469903d5126c1f0a13a2bcbb"}, + {file = "cffi-1.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a0ce71725cacc9ebf839630772b07eeec220cbb5f03be1399e0457a1464f8e1a"}, + {file = "cffi-1.17.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c815270206f983309915a6844fe994b2fa47e5d05c4c4cef267c3b30e34dbe42"}, + {file = "cffi-1.17.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6bdcd415ba87846fd317bee0774e412e8792832e7805938987e4ede1d13046d"}, + {file = "cffi-1.17.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a98748ed1a1df4ee1d6f927e151ed6c1a09d5ec21684de879c7ea6aa96f58f2"}, + {file = "cffi-1.17.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0a048d4f6630113e54bb4b77e315e1ba32a5a31512c31a273807d0027a7e69ab"}, + {file = "cffi-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24aa705a5f5bd3a8bcfa4d123f03413de5d86e497435693b638cbffb7d5d8a1b"}, + {file = "cffi-1.17.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:856bf0924d24e7f93b8aee12a3a1095c34085600aa805693fb7f5d1962393206"}, + {file = "cffi-1.17.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4304d4416ff032ed50ad6bb87416d802e67139e31c0bde4628f36a47a3164bfa"}, + {file = "cffi-1.17.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:331ad15c39c9fe9186ceaf87203a9ecf5ae0ba2538c9e898e3a6967e8ad3db6f"}, + {file = "cffi-1.17.0-cp310-cp310-win32.whl", hash = "sha256:669b29a9eca6146465cc574659058ed949748f0809a2582d1f1a324eb91054dc"}, + {file = "cffi-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:48b389b1fd5144603d61d752afd7167dfd205973a43151ae5045b35793232aa2"}, + {file = "cffi-1.17.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c5d97162c196ce54af6700949ddf9409e9833ef1003b4741c2b39ef46f1d9720"}, + {file = "cffi-1.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ba5c243f4004c750836f81606a9fcb7841f8874ad8f3bf204ff5e56332b72b9"}, + {file = "cffi-1.17.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bb9333f58fc3a2296fb1d54576138d4cf5d496a2cc118422bd77835e6ae0b9cb"}, + {file = "cffi-1.17.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:435a22d00ec7d7ea533db494da8581b05977f9c37338c80bc86314bec2619424"}, + {file = "cffi-1.17.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1df34588123fcc88c872f5acb6f74ae59e9d182a2707097f9e28275ec26a12d"}, + {file = "cffi-1.17.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:df8bb0010fdd0a743b7542589223a2816bdde4d94bb5ad67884348fa2c1c67e8"}, + {file = "cffi-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b5b9712783415695663bd463990e2f00c6750562e6ad1d28e072a611c5f2a6"}, + {file = "cffi-1.17.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ffef8fd58a36fb5f1196919638f73dd3ae0db1a878982b27a9a5a176ede4ba91"}, + {file = "cffi-1.17.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e67d26532bfd8b7f7c05d5a766d6f437b362c1bf203a3a5ce3593a645e870b8"}, + {file = "cffi-1.17.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45f7cd36186db767d803b1473b3c659d57a23b5fa491ad83c6d40f2af58e4dbb"}, + {file = "cffi-1.17.0-cp311-cp311-win32.whl", hash = "sha256:a9015f5b8af1bb6837a3fcb0cdf3b874fe3385ff6274e8b7925d81ccaec3c5c9"}, + {file = "cffi-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:b50aaac7d05c2c26dfd50c3321199f019ba76bb650e346a6ef3616306eed67b0"}, + {file = "cffi-1.17.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aec510255ce690d240f7cb23d7114f6b351c733a74c279a84def763660a2c3bc"}, + {file = "cffi-1.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:2770bb0d5e3cc0e31e7318db06efcbcdb7b31bcb1a70086d3177692a02256f59"}, + {file = "cffi-1.17.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db9a30ec064129d605d0f1aedc93e00894b9334ec74ba9c6bdd08147434b33eb"}, + {file = "cffi-1.17.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a47eef975d2b8b721775a0fa286f50eab535b9d56c70a6e62842134cf7841195"}, + {file = "cffi-1.17.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f3e0992f23bbb0be00a921eae5363329253c3b86287db27092461c887b791e5e"}, + {file = "cffi-1.17.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6107e445faf057c118d5050560695e46d272e5301feffda3c41849641222a828"}, + {file = "cffi-1.17.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb862356ee9391dc5a0b3cbc00f416b48c1b9a52d252d898e5b7696a5f9fe150"}, + {file = "cffi-1.17.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c1c13185b90bbd3f8b5963cd8ce7ad4ff441924c31e23c975cb150e27c2bf67a"}, + {file = "cffi-1.17.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:17c6d6d3260c7f2d94f657e6872591fe8733872a86ed1345bda872cfc8c74885"}, + {file = "cffi-1.17.0-cp312-cp312-win32.whl", hash = "sha256:c3b8bd3133cd50f6b637bb4322822c94c5ce4bf0d724ed5ae70afce62187c492"}, + {file = "cffi-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:dca802c8db0720ce1c49cce1149ff7b06e91ba15fa84b1d59144fef1a1bc7ac2"}, + {file = "cffi-1.17.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6ce01337d23884b21c03869d2f68c5523d43174d4fc405490eb0091057943118"}, + {file = "cffi-1.17.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cab2eba3830bf4f6d91e2d6718e0e1c14a2f5ad1af68a89d24ace0c6b17cced7"}, + {file = "cffi-1.17.0-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:14b9cbc8f7ac98a739558eb86fabc283d4d564dafed50216e7f7ee62d0d25377"}, + {file 
= "cffi-1.17.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b00e7bcd71caa0282cbe3c90966f738e2db91e64092a877c3ff7f19a1628fdcb"}, + {file = "cffi-1.17.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:41f4915e09218744d8bae14759f983e466ab69b178de38066f7579892ff2a555"}, + {file = "cffi-1.17.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4760a68cab57bfaa628938e9c2971137e05ce48e762a9cb53b76c9b569f1204"}, + {file = "cffi-1.17.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:011aff3524d578a9412c8b3cfaa50f2c0bd78e03eb7af7aa5e0df59b158efb2f"}, + {file = "cffi-1.17.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:a003ac9edc22d99ae1286b0875c460351f4e101f8c9d9d2576e78d7e048f64e0"}, + {file = "cffi-1.17.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ef9528915df81b8f4c7612b19b8628214c65c9b7f74db2e34a646a0a2a0da2d4"}, + {file = "cffi-1.17.0-cp313-cp313-win32.whl", hash = "sha256:70d2aa9fb00cf52034feac4b913181a6e10356019b18ef89bc7c12a283bf5f5a"}, + {file = "cffi-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:b7b6ea9e36d32582cda3465f54c4b454f62f23cb083ebc7a94e2ca6ef011c3a7"}, + {file = "cffi-1.17.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:964823b2fc77b55355999ade496c54dde161c621cb1f6eac61dc30ed1b63cd4c"}, + {file = "cffi-1.17.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:516a405f174fd3b88829eabfe4bb296ac602d6a0f68e0d64d5ac9456194a5b7e"}, + {file = "cffi-1.17.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dec6b307ce928e8e112a6bb9921a1cb00a0e14979bf28b98e084a4b8a742bd9b"}, + {file = "cffi-1.17.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4094c7b464cf0a858e75cd14b03509e84789abf7b79f8537e6a72152109c76e"}, + {file = "cffi-1.17.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:2404f3de742f47cb62d023f0ba7c5a916c9c653d5b368cc966382ae4e57da401"}, + {file = "cffi-1.17.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3aa9d43b02a0c681f0bfbc12d476d47b2b2b6a3f9287f11ee42989a268a1833c"}, + {file = "cffi-1.17.0-cp38-cp38-win32.whl", hash = "sha256:0bb15e7acf8ab35ca8b24b90af52c8b391690ef5c4aec3d31f38f0d37d2cc499"}, + {file = "cffi-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:93a7350f6706b31f457c1457d3a3259ff9071a66f312ae64dc024f049055f72c"}, + {file = "cffi-1.17.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a2ddbac59dc3716bc79f27906c010406155031a1c801410f1bafff17ea304d2"}, + {file = "cffi-1.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6327b572f5770293fc062a7ec04160e89741e8552bf1c358d1a23eba68166759"}, + {file = "cffi-1.17.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbc183e7bef690c9abe5ea67b7b60fdbca81aa8da43468287dae7b5c046107d4"}, + {file = "cffi-1.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bdc0f1f610d067c70aa3737ed06e2726fd9d6f7bfee4a351f4c40b6831f4e82"}, + {file = "cffi-1.17.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6d872186c1617d143969defeadac5a904e6e374183e07977eedef9c07c8953bf"}, + {file = "cffi-1.17.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0d46ee4764b88b91f16661a8befc6bfb24806d885e27436fdc292ed7e6f6d058"}, + {file = "cffi-1.17.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f76a90c345796c01d85e6332e81cab6d70de83b829cf1d9762d0a3da59c7932"}, + {file = "cffi-1.17.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0e60821d312f99d3e1569202518dddf10ae547e799d75aef3bca3a2d9e8ee693"}, + {file = "cffi-1.17.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:eb09b82377233b902d4c3fbeeb7ad731cdab579c6c6fda1f763cd779139e47c3"}, + {file = "cffi-1.17.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:24658baf6224d8f280e827f0a50c46ad819ec8ba380a42448e24459daf809cf4"}, + {file = "cffi-1.17.0-cp39-cp39-win32.whl", hash = "sha256:0fdacad9e0d9fc23e519efd5ea24a70348305e8d7d85ecbb1a5fa66dc834e7fb"}, + {file = "cffi-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:7cbc78dc018596315d4e7841c8c3a7ae31cc4d638c9b627f87d52e8abaaf2d29"}, + {file = "cffi-1.17.0.tar.gz", hash = "sha256:f3157624b7558b914cb039fd1af735e5e8049a87c817cc215109ad1c8779df76"}, +] + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "idna" +version = "3.7" +description = "Internationalized Domain Names in Applications (IDNA)" +optional 
= false +python-versions = ">=3.5" +files = [ + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, +] + +[[package]] +name = "outcome" +version = "1.3.0.post0" +description = "Capture the outcome of Python function calls." +optional = false +python-versions = ">=3.7" +files = [ + {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, + {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, +] + +[package.dependencies] +attrs = ">=19.2.0" + +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + +[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." 
+optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, + {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, + {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, +] + +[[package]] +name = "selenium" +version = "4.23.1" +description = "Official Python bindings for Selenium WebDriver" +optional = false +python-versions = ">=3.8" +files = [ + {file = "selenium-4.23.1-py3-none-any.whl", hash = "sha256:3a8d9f23dc636bd3840dd56f00c2739e32ec0c1e34a821dd553e15babef24477"}, + {file = "selenium-4.23.1.tar.gz", hash = "sha256:128d099e66284437e7128d2279176ec7a06e6ec7426e167f5d34987166bd8f46"}, +] + +[package.dependencies] +certifi = ">=2021.10.8" +trio = ">=0.17,<1.0" +trio-websocket = ">=0.9,<1.0" +typing_extensions = ">=4.9,<5.0" +urllib3 = {version = ">=1.26,<3", extras = ["socks"]} +websocket-client = ">=1.8,<2.0" + +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = 
"sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + +[[package]] +name = "tqdm" +version = "4.66.5" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, + {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "trio" +version = "0.26.2" +description = "A friendly Python library for async concurrency and I/O" +optional = false +python-versions = ">=3.8" +files = [ + {file = "trio-0.26.2-py3-none-any.whl", hash = "sha256:c5237e8133eb0a1d72f09a971a55c28ebe69e351c783fc64bc37db8db8bbe1d0"}, + {file = "trio-0.26.2.tar.gz", hash = "sha256:0346c3852c15e5c7d40ea15972c4805689ef2cb8b5206f794c9c19450119f3a4"}, +] + +[package.dependencies] +attrs = ">=23.2.0" +cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +idna = "*" +outcome = "*" +sniffio = ">=1.3.0" +sortedcontainers = "*" + +[[package]] +name = "trio-websocket" +version = "0.11.1" +description = "WebSocket library for Trio" +optional = false +python-versions = ">=3.7" +files = [ + {file = "trio-websocket-0.11.1.tar.gz", hash = "sha256:18c11793647703c158b1f6e62de638acada927344d534e3c7628eedcb746839f"}, + {file = "trio_websocket-0.11.1-py3-none-any.whl", hash = "sha256:520d046b0d030cf970b8b2b2e00c4c2245b3807853ecd44214acd33d74581638"}, +] + +[package.dependencies] +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +trio = ">=0.11" 
+wsproto = ">=0.14" + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[[package]] +name = "urllib3" +version = "2.2.2" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, + {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, +] + +[package.dependencies] +pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "websocket-client" +version = "1.8.0" +description = "WebSocket client for Python with low level API options" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, + {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, +] + +[package.extras] +docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] +optional = ["python-socks", "wsaccel"] +test = ["websockets"] + +[[package]] +name = "wsproto" +version = "1.2.0" +description = "WebSockets state-machine based protocol implementation" +optional = false 
+python-versions = ">=3.7.0" +files = [ + {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, + {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, +] + +[package.dependencies] +h11 = ">=0.9.0,<1" + +[[package]] +name = "yattag" +version = "1.16.0" +description = "Generate HTML or XML in a pythonic way. Pure python alternative to web template engines.Can fill HTML forms with default values and error messages." +optional = false +python-versions = "*" +files = [ + {file = "yattag-1.16.0.tar.gz", hash = "sha256:0978247b9754d9f44e3703c64374ab9fa872d18de95ac5772fdfdd3c3f0d0706"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.10" +content-hash = "1e0968d348899083a0e9546a3c20a6a43a16fc7eede114227563d98b7796ae71" diff --git a/postprocessor/__main__.py b/postprocessor/__main__.py new file mode 100644 index 0000000..afdc65c --- /dev/null +++ b/postprocessor/__main__.py @@ -0,0 +1,245 @@ +import sys +from collections import defaultdict +from typing_extensions import cast +from yattag import Doc, AsIs +import re +from functools import reduce +import json +import time +import datetime +from pathlib import Path + +SLUG_PATTERN = re.compile(r"[\W_]+") + + +def slugify(value): + value = value.encode("ascii", errors="ignore").decode() + value = SLUG_PATTERN.sub("-", value) + return value.strip("-") + + +JSONFILE = sys.argv[1] +OUTPUTFILE = sys.argv[2] + + +col_titles = { + # included in the CSV + "lab": "Lab", + "chemistry": "Chemistry", + "format": "Format", + "subformat": "Subformat", + "includesSendShipping": "Includes outbound shipping?", + "sendShippingType": "Outbound shipping type", + "returnShippingCost": "Return shipping cost", + "returnShippingType": "Return shipping provider", + "cost": "Development cost", + "resolution": "Scan resolution", + "resolutionName": "Scan resolution name", + "url": "Product URL", + # render 
only + "outboundShipping": "Outbound shipping", + "returnShipping": "Return shipping", + "renderResolution": "Scan resolution", + "pricePerPixel": "Price per pixel", + "link": "Order page", + "calculatedPrice": "Calculated price", +} + +entries_by_type = defaultdict(lambda: []) +notes_by_type = {} + + +def _render_line(*args, **kwargs): + d = Doc() + d.line(*args, **kwargs) + return d.getvalue() + + +def _format_price(price): + return "£{:.2f}".format(price) + + +raw_data_object = None + +with open(JSONFILE) as f: + raw_data_object = json.load(f) + +for row in raw_data_object["data"]: + entries_by_type[(row["chemistry"], row["format"], row["subformat"])].append(row) + +for row in raw_data_object["notes"]: + notes_by_type[(row["chemistry"], row["format"], row["subformat"])] = row["note"] + +doc, tag, text, line = Doc().ttl() + +doc.asis("") +with tag("html"): + with tag("head"): + doc.stag("meta", charset="utf-8") + doc.stag("meta", name="viewport", content="width=device-width, initial-scale=1") + doc.stag( + "link", + rel="stylesheet", + href="https://www.akpain.net/assets/css/risotto.css", + ) + doc.stag( + "link", + rel="stylesheet", + href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css", + ) + doc.stag( + "link", + rel="stylesheet", + href="https://cdn.jsdelivr.net/npm/simple-datatables@latest/dist/style.css", + ) + with tag("script", src="https://cdn.jsdelivr.net/npm/simple-datatables@latest"): + doc.asis("") + + with tag("body"): + with tag("div", klass="container pt-3"): + + line("a", "[abi abi] $", klass="pe-3", href="https://www.akpain.net") + line("a", "back to photography", href="https://www.akpain.net/photography/") + + line("h1", "Film Development Price Comparison", klass="pt-2") + + line("p", "This is my attempt to work out the best value for money film developing and service that's available in the UK. 
Labs are compared as like-for-like as possible, but some variation (especially in scan size) is inevitable.") + with tag("p"): + text("If your favourite/local/whatever lab isn't listed here, ") + line("a", "let me know", href="https://www.akpain.net#contact") + text(" and I'll add it! Likewise, if you want to see E6, ECN2, half frame, 120 or anything else here, please do tell me.") + + line( + "p", + "Development costs last updated " + + datetime.datetime.utcfromtimestamp(raw_data_object["time"]).strftime( + "%Y-%m-%d %H:%M:%S" + ) + + ". Price per pixel figures do not include estimates for outbound or return shipping." + ) + + with tag("div", klass="card", style="width: 18rem;"): + with tag("div", klass="card-body"): + line("div", "Contents", klass="card-title", style="font-family: var(--font-monospace)") + with tag("ul", klass="card-text"): + for key in entries_by_type: + chemistry, format, subformat = key + slug = slugify(chemistry + format + subformat) + with tag("li"): + line("a", f"{chemistry} {format} ({subformat})", href=f"#{slug}-title") + + slugs = [] + + for key in entries_by_type: + chemistry, format, subformat = key + + slug = slugify(chemistry + format + subformat) + slugs.append(slug) + + line( + "h2", + f"{chemistry} {format} ({subformat})", + klass="h3 pt-4", + id=slug + "-title", + ) + + if key in notes_by_type: + line("p", notes_by_type[key]) + + cols = [ + ("lab", lambda x: x["lab"]), + ( + "outboundShipping", + lambda x: "×" + if x["includesSendShipping"].lower() == "no" + else x["sendShippingType"], + ), + ( + "returnShipping", + lambda x: ( + "Free" + if (c := float(x["returnShippingCost"])) == 0 + else _format_price(c) + ) + + f" ({x['returnShippingType']})", + ), + ("cost", lambda x: _format_price(float(x["cost"]))), + ( + "renderResolution", + lambda x: f"{x['resolution']} ({repr(x['resolutionName'])})", + ), + ( + "pricePerPixel", + lambda x: "{:.5f}p".format( + float(x["cost"]) + * 100 + / reduce( + lambda y, z: y * z, + map(int, 
x["resolution"].split("x")), + 1, + ) + ), + ), + ("link", lambda x: _render_line("a", "Link", href=x["url"])), + ] + + # begin working out price per pixel colour scales + pppfn = None + for i, item in enumerate(cols): + if item[0] == "pricePerPixel": + pppfn = item[1] + break + assert pppfn is not None + pppcolours = {pppfn(data): "" for data in entries_by_type[key]} + coldiff = ( + int(120 / (len(pppcolours) - 1)) if len(pppcolours) - 1 != 0 else 0 + ) + for i, (val, rawval) in enumerate( + sorted( + map(lambda x: (float(x[:-1]), x), pppcolours.keys()), + key=lambda y: y[0], + ) + ): + pppcolours[rawval] = f"hsl({120 - (i * coldiff)}, 71%, 73%)" + # end + + with tag("table", klass="table table-hover", id=slug): + with tag("thead"): + with tag("tr"): + for t, _ in cols: + line("th", col_titles[t], scope="col") + + with tag("tbody"): + for data in sorted( + entries_by_type[key], key=lambda x: x["lab"] + ): + with tag("tr"): + for i, (key, fn) in enumerate(cols): + if i == 0: + line("th", fn(data), scope="row") + else: + with tag("td"): + val = fn(data) + doc.asis(val) + + if key == "pricePerPixel": + doc.attr( + style="background-color: " + + pppcolours[val] + ) + + with tag("script"): + doc.asis("const slugs = ") + doc.asis(json.dumps(slugs)) + doc.asis(";\n") + with open(Path(__file__).resolve().parent / "page.js") as f: + doc.asis(f.read()) + + with tag( + "script", + src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js", + ): + doc.asis() + +with open(OUTPUTFILE, "w") as f: + f.write(doc.getvalue()) diff --git a/postprocessor/page.js b/postprocessor/page.js new file mode 100644 index 0000000..003e563 --- /dev/null +++ b/postprocessor/page.js @@ -0,0 +1,9 @@ +// populated by ssg: +// const slugs = []; + +for (const slug of slugs) { + new simpleDatatables.DataTable("#" + slug, { + paging: false, + searchable: false, + }) +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4185e6a --- /dev/null +++ 
b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.poetry] +name = "filmdev-scraper" +version = "0.1.0" +description = "" +authors = ["AKP "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.10" +selenium = "^4.23.1" +yattag = "^1.16.0" +tqdm = "^4.66.5" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 0000000..92b1c26 --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,4 @@ +{ + "venvPath": "/home/akp/.cache/pypoetry/virtualenvs", + "venv": "filmdev-scraper-ijnHeEG_-py3.10" +} diff --git a/scraper/__main__.py b/scraper/__main__.py new file mode 100644 index 0000000..34bd70b --- /dev/null +++ b/scraper/__main__.py @@ -0,0 +1,51 @@ +import scrapers +from selenium import webdriver +from tqdm import tqdm +import json +import time +import sys + +OUTPUTFILE = sys.argv[0] + +driver = webdriver.Firefox() + +datapoints = [] + +try: + for cls in tqdm( + [ + scrapers.TheFilmSafe, + scrapers.HarmanLab, + scrapers.AGPhotoLab, + scrapers.FilmProcessingCoUk, + scrapers.PPPCamera, + scrapers.AnalogueWonderland, + scrapers.Minilab, + ] + ): + datapoints += cls(driver).scrape() +finally: + driver.quit() + +with open(OUTPUTFILE, "w") as f: + json.dump( + { + "time": int(time.time()), + "data": datapoints, + "notes": [ + { + "chemistry": "C41", + "format": "35mm", + "subformat": "full frame", + "note": "Assuming one 36-shot roll of film", + }, + { + "chemistry": "B&W", + "format": "35mm", + "subformat": "full frame", + "note": "Assuming one 36-shot roll of film", + }, + ], + }, + f, + ) diff --git a/scraper/scrapers.py b/scraper/scrapers.py new file mode 100644 index 0000000..65c6e42 --- /dev/null +++ b/scraper/scrapers.py @@ -0,0 +1,534 @@ +import re +import time +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import Select +from selenium.webdriver.support import expected_conditions +from 
selenium.webdriver.support.wait import WebDriverWait
+import selenium.common.exceptions
+
+
+class _BaseScraper:  # common base: only stores the browser driver
+    def __init__(self, driver):  # driver: object exposing get()/find_element() etc.
+        self.driver = driver
+
+
+class PPPCamera(_BaseScraper):  # PPP Cameras: 35mm C41, "Dev + Mid Res" option
+    def scrape(self) -> list[dict]:  # one record; "cost" is the page price minus "£"
+        self.driver.get("https://pppcameras.co.uk/lab/p/35mm-film")
+
+        values_to_select = [
+            # (aria-label of the <select>, value of the <option> to choose)
+            ("Select Services", "Dev + Mid Res"),
+            ("Select Full Frame scans", "No"),
+            ("Select Prints", "No Prints"),
+        ]
+
+        for (aria_label, option_value) in values_to_select:
+            elem = self.driver.find_element(
+                By.CSS_SELECTOR, f"select[aria-label={repr(aria_label)}]"
+            )
+            Select(elem).select_by_value(option_value)
+
+        elem = self.driver.find_element(By.CSS_SELECTOR, "div.product-price")
+        return [
+            {
+                "lab": "PPP Cameras",
+                "chemistry": "C41",
+                "format": "35mm",
+                "subformat": "full frame",
+                "includesSendShipping": "no",
+                "sendShippingType": "",
+                "returnShippingCost": "0",
+                "returnShippingType": "Unspecified",
+                "cost": elem.text.replace("£", ""),
+                "resolution": "3637x2433",
+                "resolutionName": "Mid",
+                "url": "https://pppcameras.co.uk/lab/p/35mm-film",
+            }
+        ]
+
+
+class AnalogueWonderland(_BaseScraper):  # Analogue Wonderland: 35mm C41 and B&W
+    def scrape(self) -> list[dict]:  # one record per chemistry; each cost reloads the page
+        return [
+            {
+                "lab": "Analogue Wonderland",
+                "chemistry": "C41",
+                "format": "35mm",
+                "subformat": "full frame",
+                "includesSendShipping": "yes",
+                "sendShippingType": "Royal Mail Tracked 48",
+                "returnShippingCost": "3",
+                "returnShippingType": "Unspecified",
+                "cost": self._scrape_35mm_with_options(
+                    [
+                        # titles of the option swatches to click, in order
+                        "Colour (C-41)",
+                        "Standard Scans",
+                        "Correct and Rotate",
+                    ]
+                ),
+                "resolution": "3024x2005",
+                "resolutionName": "Standard",
+                "url": "https://analoguewonderland.co.uk/products/35mm-film-development",
+            },
+            {
+                "lab": "Analogue Wonderland",
+                "chemistry": "B&W",
+                "format": "35mm",
+                "subformat": "full frame",
+                "includesSendShipping": "yes",
+                "sendShippingType": "Royal Mail Tracked 48",
+                "returnShippingCost": "3",
+                "returnShippingType": "Unspecified",
+                "cost": self._scrape_35mm_with_options(
+                    [
+                        # titles of the option swatches to click, in order
+                        "Black and White",
+                        "Standard Scans",
+                        "Correct and Rotate",
+                    ]
+                ),
+                "resolution": "3024x2005",
+                "resolutionName": "Standard",
+                "url": "https://analoguewonderland.co.uk/products/35mm-film-development",
+            },
+        ]
+
+    def _scrape_35mm_with_options(self, opts: list[str]) -> str:
+        # opts: titles of the buttons to click; returns the shown price without "£"
+
+        self.driver.get(
+            "https://analoguewonderland.co.uk/products/35mm-film-development"
+        )
+
+        try:
+            # wait up to 5s for the "free film!!1" popup and close it
+            elem = WebDriverWait(self.driver, 5).until(
+                expected_conditions.presence_of_element_located(
+                    (By.CSS_SELECTOR, '[aria-label="Close dialog"]')
+                )
+            )
+            elem.click()
+            time.sleep(1)  # wait for animation to play
+        except selenium.common.exceptions.TimeoutException:
+            pass  # no popup appeared within the timeout; carry on
+
+        # make sure it's possible to see the buttons (will raise an exception if we try to click something that's off of the page) by scrolling to the review widget
+        self.driver.execute_script(
+            "arguments[0].scrollIntoView(true)",
+            self.driver.find_element(By.CSS_SELECTOR, ".jdgm-prev-badge__text"),
+        )
+
+        for title in opts:
+            elem = self.driver.find_element(
+                By.CSS_SELECTOR, f"label.block-swatch__item[title={repr(title)}]"
+            )
+            elem.click()
+
+        return self.driver.find_element(
+            By.CSS_SELECTOR, "span.price > span.money"
+        ).text.replace("£", "")
+
+
+class Minilab(_BaseScraper):  # The Minilab: 35mm C41 and B&W, "High Res JPEG" option
+    def scrape(self) -> list[dict]:  # returns [c41, bw]; each cost is scraped from its own URL
+        c41 = {
+            "lab": "The Minilab",
+            "chemistry": "C41",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "no",
+            "sendShippingType": "",
+            "returnShippingCost": "0.85",
+            "returnShippingType": "Royal Mail 48",
+            "resolution": "3024x2005",
+            "resolutionName": "High JPEG",
+            "url": "https://www.theminilab.co.uk/product-page/c41-dev-scan",
+        }
+        c41["cost"] = self._scrape_35mm_with_url(c41["url"])
+
+        bw = {
+            "lab": "The Minilab",
+            "chemistry": "B&W",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "no",
+            "sendShippingType": "",
+            "returnShippingCost": "0.85",
+            "returnShippingType": "Royal Mail 48",
+            "resolution": "3024x2005",
+            "resolutionName": "High JPEG",
+            "url": "https://www.theminilab.co.uk/product-page/b-w-35mm-dev-scan",
+        }
+        bw["cost"] = self._scrape_35mm_with_url(bw["url"])
+
+        return [c41, bw]
+
+    def _scrape_35mm_with_url(self, url) -> str:  # price text (minus "£") for "High Res JPEG"
+        self.driver.get(url)
+
+        res_elem_found = WebDriverWait(self.driver, 10).until(
+            expected_conditions.text_to_be_present_in_element(
+                (By.CSS_SELECTOR, "label[for]"), "Resolution"
+            ),
+        )  # wait (up to 10s) for the dynamically created form to be rendered
+        assert res_elem_found
+
+        dropdown_items = self.driver.find_elements(
+            By.CSS_SELECTOR, 'div[data-hook="dropdown-base-text"]'
+        )
+        select_item = None
+        for item in dropdown_items:
+            if item.text == "Select":  # presumably the dropdown with nothing chosen yet
+                select_item = item
+                break
+
+        assert select_item is not None
+        self.driver.execute_script("arguments[0].scrollIntoView(true)", select_item)
+        select_item.click()
+
+        dropdown_items = self.driver.find_elements(
+            By.CSS_SELECTOR, "span[aria-hidden=false]"
+        )
+        high_res_item = None
+        for item in dropdown_items:
+            if item.text == "High Res JPEG":
+                high_res_item = item
+                break
+
+        assert high_res_item is not None
+        high_res_item.click()
+
+        return self.driver.find_element(
+            By.CSS_SELECTOR, "span[data-wix-price]"
+        ).text.replace("£", "")
+
+
+class FilmProcessingCoUk(_BaseScraper):  # FilmProcessing.co.uk: 35mm C41 and B&W
+    def scrape(self) -> list[dict]:  # returns [c41, bw]; each cost is scraped from its own URL
+        c41 = {
+            "lab": "FilmProcessing.co.uk",
+            "chemistry": "C41",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "no",
+            "sendShippingType": "",
+            "returnShippingCost": "0",
+            "returnShippingType": "Royal Mail 48",
+            "resolution": "2728x1830",
+            "resolutionName": "Standard",
+            "url": "https://www.filmprocessing.co.uk/onlinestore/35mm-Colour-Film-Processing-p68571250",
+        }
+        c41["cost"] = self._scrape_35mm_with_url_and_opts(
+            c41["url"],
+            [
+                ("Exposure", "Up to 39 Exposure"),  # NOTE(review): B&W list below says "Exposures" — confirm
+                ("Print Size", "No Prints Required"),
+                ("Extra Sets (Per Film)", "No Extra Set Required"),
+                ("Film to CD / Dropbox", "Medium Quality Dropbox"),
+            ],
+        )
+
+        bw = {
+            "lab": "FilmProcessing.co.uk",
+            "chemistry": "B&W",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "no",
+            "sendShippingType": "",
+            "returnShippingCost": "0",
+            "returnShippingType": "Royal Mail 48",
+            "resolution": "2728x1830",
+            "resolutionName": "Standard",
+            "url": "https://www.filmprocessing.co.uk/onlinestore/35mm-Black-&-White-Film-Processing-p345592049",
+        }
+        bw["cost"] = self._scrape_35mm_with_url_and_opts(
+            bw["url"],
+            [
+                ("Exposures", "Up to 39 Exposures"),
+                ("Print Size", "No Prints Required"),
+                ("Extra Sets (per Film)", "No Extra Sets Required"),
+                ("Film to CD / Dropbox", "Medium Quality Dropbox"),
+            ],
+        )
+
+        return [c41, bw]
+
+    def _scrape_35mm_with_url_and_opts(
+        self, url: str, opts: list[tuple[str, str]]
+    ) -> str:
+        # opts: (aria-label of the <select>, option value to select); returns price minus "£"
+        self.driver.get(url)
+
+        WebDriverWait(self.driver, 10).until(
+            expected_conditions.presence_of_element_located(
+                (By.CSS_SELECTOR, 'iframe[aria-label="Online Store"][src]')
+            )
+        )  # wait for one iframe to get its source
+
+        all_iframes = self.driver.find_elements(
+            By.CSS_SELECTOR, 'iframe[aria-label="Online Store"][src]'
+        )  # get all iframes
+        # search for the frame whose src contains the last segment of the product URL
+        target_iframe = None
+        for frame in all_iframes:
+            if url.split("/")[-1] not in frame.get_attribute("src"):
+                continue
+            target_iframe = frame
+            continue  # NOTE(review): no-op; `break` may have been intended — as written the last match wins
+        assert target_iframe is not None
+
+        self.driver.get(target_iframe.get_attribute("src"))  # open the frame's document directly
+
+        elem = WebDriverWait(self.driver, 10).until(
+            expected_conditions.text_to_be_present_in_element(
+                (By.CSS_SELECTOR, "div.product-details-module__title"), "Exposure"
+            )
+        )  # wait for form to be dynamically loaded in (result is overwritten below)
+
+        for (aria_label, option_value) in opts:
+            elem = self.driver.find_element(
+                By.CSS_SELECTOR, f"select[aria-label={repr(aria_label)}]"
+            )
+            Select(elem).select_by_value(option_value)
+
+        return self.driver.find_element(
+            By.CSS_SELECTOR, "span.details-product-price__value"
+        ).text.replace("£", "")
+
+
+class AGPhotoLab(_BaseScraper):  # AG Photo Lab: 35mm C41 and B&W, standard 8-bit JPEG scan
+    def scrape(self) -> list[dict]:  # returns [c41, bw]; each cost is scraped from its own URL
+        c41 = {
+            "lab": "AG Photo Lab",
+            "chemistry": "C41",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "yes",
+            "sendShippingType": "Freepost",
+            "returnShippingCost": "4.94",
+            "returnShippingType": "Royal Mail 24",
+            "resolution": "3089x2048",
+            "resolutionName": "Standard JPEG",
+            "url": "https://www.ag-photolab.co.uk/product/c41/",
+        }
+        c41["cost"] = self._scrape_35mm_with_url_and_options(
+            c41["url"],
+            [
+                ("5c8fbe78a2c805.23255089", "35mm_0"),  # film format
+                ("666aa5b7aab344.41469556", "Standard sleeving_0"),  # film sleeving
+                ("5c8fcb67a26bd1.60477546", "Standard Scan 8bit JPEG_0"),  # scans
+                (
+                    "5c8fcbc6a26c40.29952473",
+                    "Upload files via the web_0",
+                ),  # scan delivery
+            ],
+        )
+
+        bw = {
+            "lab": "AG Photo Lab",
+            "chemistry": "B&W",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "yes",
+            "sendShippingType": "Freepost",
+            "returnShippingCost": "4.94",
+            "returnShippingType": "Royal Mail 24",
+            "resolution": "3089x2048",
+            "resolutionName": "Standard JPEG",
+            "url": "https://www.ag-photolab.co.uk/product/black-white/",
+        }
+        bw["cost"] = self._scrape_35mm_with_url_and_options(
+            bw["url"],
+            [
+                ("5c90be26ccc352.83454456", "35mm_0"),  # film format
+                ("5c90c037ccc3d4.45704796", "Standard Sleeving_0"),  # film sleeving
+                ("5c90be26ccc341.38603868", "Standard Scan 8bit JPEG_0"),  # scans
+                (
+                    "5c90c097ccc3e6.45684541",
+                    "Upload files via the web_0",
+                ),  # scan delivery
+            ],
+        )
+
+        return [c41, bw]
+
+    def _scrape_35mm_with_url_and_options(
+        self, url: str, opts: list[tuple[str, str]]
+    ) -> str:
+        # opts: (data-uniqid of the option container, option value to select); returns price minus "£"
+        self.driver.get(url)
+
+        try:
+            elem = (  # NOTE(review): this binds the return value of .click(), not the button
+                WebDriverWait(self.driver, 3)
+                .until(
+                    expected_conditions.presence_of_element_located(
+                        (By.CSS_SELECTOR, "button.cky-btn-reject")
+                    )
+                )
+                .click()
+            )  # this cookie popup is big enough that i can see it causing issues so we'll actually get rid of it here
+        except selenium.common.exceptions.TimeoutException:
+            pass  # no cookie popup within 3s; carry on
+
+        elem = WebDriverWait(self.driver, 10).until(
+            expected_conditions.presence_of_element_located(
+                (By.CSS_SELECTOR, f'div[data-uniqid="{opts[0][0]}"]')
+            )
+        )  # wait for the selection boxes to appear
+        self.driver.execute_script(
+            "arguments[0].scrollIntoView(true)",
+            self.driver.find_element(By.CSS_SELECTOR, "h1.product_title"),
+        )  # make sure it's possible to see the selection boxes
+
+        for (aria_label, option_value) in opts:  # aria_label holds the data-uniqid value here
+            elem = self.driver.find_element(
+                By.CSS_SELECTOR, f"[data-uniqid={repr(aria_label)}]"
+            )
+            elem = elem.find_element(By.TAG_NAME, "select")
+            Select(elem).select_by_value(option_value)
+
+        return (
+            self.driver.find_element(By.CSS_SELECTOR, "span.price.amount.final")
+            .text.replace(" ", "")
+            .replace("£", "")
+        )
+
+
+class HarmanLab(_BaseScraper):  # Harman Lab: 35mm C41 and B&W product pages
+    def scrape(self) -> list[dict]:  # returns [c41, bw]; each cost is scraped from its own URL
+        c41 = {
+            "lab": "Harman Lab",
+            "chemistry": "C41",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "no",
+            "sendShippingType": "",
+            "returnShippingCost": "2.95",
+            "returnShippingType": "Royal Mail 24",
+            "resolution": "1500x2250",
+            "resolutionName": "Std",
+            "url": "https://harmanlab.com/products/developing-only-135-colour-c41-film?variant=42500108189938",
+        }
+        c41["cost"] = self._scrape_with_url(c41["url"])
+
+        bw = {
+            "lab": "Harman Lab",
+            "chemistry": "B&W",
+            "format": "35mm",
+            "subformat": "full frame",
+            "includesSendShipping": "no",
+            "sendShippingType": "",
+            "returnShippingCost": "2.95",
+            "returnShippingType": "Royal Mail 24",
+            "resolution": "1500x2250",
+            "resolutionName": "Std",
+            "url": "https://harmanlab.com/products/black-and-white-film-developing-only?variant=42499934716146",
+        }
+        bw["cost"] = self._scrape_with_url(bw["url"])
+
+        return [c41, bw]
+
+    def _scrape_with_url(self, url: str) -> str:  # price text with " GBP" and "£" stripped
+        self.driver.get(url)
+        return (
+            self.driver.find_element(
+                By.CSS_SELECTOR, "span.price-item.price-item--regular"
+            )
+            .text.replace(" GBP", "")
+            .replace("£", "")
+        )
+
+
+class TheFilmSafe(_BaseScraper):  # The Film Safe: 35mm C41 and B&W, "Med Res JPEG" option
+    # Note for the future: they have a bulk discount of £1 per roll
+    def scrape(self) -> list[dict]:  # one record per chemistry; each cost reloads the page
+        return [
+            {
+                "lab": "The Film Safe",
+                "chemistry": "C41",
+                "format": "35mm",
+                "subformat": "full frame",
+                "includesSendShipping": "no",
+                "sendShippingType": "",
+                "returnShippingCost": "2",
+                "returnShippingType": "Royal Mail 48",
+                "cost": self._get_price_for_options(
+                    [
+                        ("product-dropdown-1_2", "C41 (colour)"),  # film process
+                        ("product-dropdown-3_4_18", "35mm"),  # film format
+                        (
+                            "product-dropdown-13_14_16_17",
+                            "Med Res JPEG (£9)",
+                        ),  # image resolution
+                        (
+                            "product-dropdown-9_10_11",
+                            "CALM",
+                        ),  # charity choice (required to get a price)
+                    ]
+                ),
+                "resolution": "3100x2100",
+                "resolutionName": "Med",
+                "url": "https://www.thefilmsafe.co.uk/product-page/developing-scanning",
+            },
+            {
+                "lab": "The Film Safe",
+                "chemistry": "B&W",
+                "format": "35mm",
+                "subformat": "full frame",
+                "includesSendShipping": "no",
+                "sendShippingType": "",
+                "returnShippingCost": "2",
+                "returnShippingType": "Royal Mail 48",
+                "cost": self._get_price_for_options(
+                    [
+                        ("product-dropdown-1_2", "BW"),  # film process
+                        ("product-dropdown-3_4_18", "35mm"),  # film format
+                        (
+                            "product-dropdown-13_14_16_17",
+                            "Med Res JPEG (£9)",
+                        ),  # image resolution
+                        (
+                            "product-dropdown-9_10_11",
+                            "CALM",
+                        ),  # charity choice (required to get a price)
+                    ]
+                ),
+                "resolution": "3100x2100",
+                "resolutionName": "Med",
+                "url": "https://www.thefilmsafe.co.uk/product-page/developing-scanning",
+            },
+        ]
+
+    def _get_price_for_options(self, opts: list[tuple[str, str]]) -> str:
+        # opts: (id of the dropdown's label, title of the option div to click); returns price minus "£"
+        self.driver.get(
+            "https://www.thefilmsafe.co.uk/product-page/developing-scanning"
+        )
+
+        WebDriverWait(self.driver, 10).until(
+            expected_conditions.presence_of_element_located(
+                (By.CSS_SELECTOR, "label#product-dropdown-1_2[for]")
+            )
+        )  # wait for the dropdown options to get stitched together to the form boxes
+
+        for (label_id, option_title) in opts:
+            elem = self.driver.find_element(
+                By.CSS_SELECTOR, f"button[aria-labelledby={repr(label_id)}]"
+            )
+            elem.click()  # open the dropdown
+            elem = self.driver.find_element(
+                By.CSS_SELECTOR,
+                f'div[data-hook="popover-content"] div[title={repr(option_title)}]',
+            )
+            elem.click()  # pick the option inside the popover
+
+        return self.driver.find_element(
+            By.CSS_SELECTOR, "span[data-wix-price]"
+        ).text.replace("£", "")
+
+
+# TODO: https://www.exposurefilmlab.com/