popepedia/collect.py
AKP 01a5cc6df3 Alter 5 files
Add .gitignore
Add Screenshot 2025-05-08 at 21-15-26 Popepedia abi abi.png
Add collect.py
Add index.html
Add tabulate.py
2025-05-10 17:21:08 +01:00

67 lines
1.6 KiB
Python

import requests
import time
import json
import os
# Read the API token from token.txt in the current working directory.
# The context manager closes the file handle right after reading instead of
# leaving it open until the interpreter happens to collect it.
with open("token.txt") as token_file:
    wikipedia_token = token_file.read().strip()

# HTTP headers sent with every API request: the token as a Bearer credential.
headers = {
    'Authorization': 'Bearer ' + wikipedia_token,
}
# Revision id to start from when there is no usable local history.
# 1289435017 is the last pre-pope edit
# 1289422020 is a couple edits prior
DEFAULT_START_REVISION = "1289422020"


def read_last_line(path):
    """Return the final line of the file at *path*, decoded as text.

    The file is scanned backwards from its end until the newline that
    precedes the last line is found, so the whole file is never read.
    A file with a single line (or an empty file) makes the backwards seek
    run past the start of the file, which raises OSError; in that case the
    first line is returned instead (an empty string for an empty file).

    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
        UnicodeDecodeError: if the last line is not valid UTF-8.
    """
    with open(path, 'rb') as f:
        try:  # catch OSError in case of a one line file
            # Start two bytes before the end so a trailing newline is skipped.
            f.seek(-2, os.SEEK_END)
            while f.read(1) != b'\n':
                # read(1) advanced by one byte; step back two to move left.
                f.seek(-2, os.SEEK_CUR)
        except OSError:
            f.seek(0)
        return f.readline().decode()


# Resume after the newest revision already stored in pope.jsonl, if any.
try:
    ll = json.loads(read_last_line('pope.jsonl'))
    print(ll)
    last_revision = ll["id"]
except FileNotFoundError:
    # Nothing collected yet: start from the default revision.
    last_revision = DEFAULT_START_REVISION
except (OSError, ValueError, KeyError, TypeError) as e:
    # Unreadable file, malformed/empty last line, or a record without "id".
    # Keep the best-effort fallback, but say so instead of hiding the error.
    print("could not resume from pope.jsonl, falling back to default revision:", repr(e))
    last_revision = DEFAULT_START_REVISION
# Title of the article whose revision history is collected.
page = 'Pope_Leo_XIV'
# History endpoint, restricted via `newer_than` to revisions after the one
# we already have. The title comes from `page` rather than being repeated.
url = 'https://api.wikimedia.org/core/v1/wikipedia/en/page/' + page + '/history?newer_than=' + str(last_revision)
n = 0     # number of revisions fetched during this run
new = []  # revisions fetched during this run, in the order they are appended
while True:
    print(url)
    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, timeout=30)
        if not response.ok:
            # Error responses (rate limit, auth failure, ...) carry no
            # revisions; report them instead of treating them as "no more data".
            print("request failed with HTTP status", response.status_code)
            break
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network error or non-JSON body: stop paging, but leave the
        # revisions gathered so far in `new` rather than crashing.
        print("request failed:", repr(exc))
        break
    revisions = data.get("revisions", [])
    # Each page is reversed before being appended.
    # NOTE(review): presumably the API lists each page newest-first, so this
    # keeps `new` in oldest-first order - confirm against the API docs.
    new.extend(reversed(revisions))
    n += len(revisions)
    print("rl remains", response.headers.get("x-ratelimit-remaining"), "rl resets", response.headers.get("x-ratelimit-reset"))
    # Stop when there is no link to a newer page. An empty page also ends
    # the loop, which avoids indexing into an empty list below.
    if not revisions or "newer" not in data:
        break
    url = data["newer"]
    print("current", revisions[0]["timestamp"])
    print(f"{n=}")
    # Short pause between successive requests.
    time.sleep(0.5)
print("FINAL", n)
# Append every revision collected in `new` to pope.jsonl,
# serialised as one JSON document per line.
with open("pope.jsonl", "a") as out:
    out.writelines(json.dumps(revision) + "\n" for revision in new)