Add .gitignore Add Screenshot 2025-05-08 at 21-15-26 Popepedia abi abi.png Add collect.py Add index.html Add tabulate.py
67 lines
1.6 KiB
Python
67 lines
1.6 KiB
Python
import requests
|
|
import time
|
|
import json
|
|
import os
|
|
|
|
# Read the Wikimedia API access token from a local file. The context manager
# closes the file handle right away instead of leaving it open until the
# interpreter happens to collect it.
with open("token.txt") as token_file:
    wikipedia_token = token_file.read().strip()

# Bearer-token authorization header for the API requests.
headers = {
    'Authorization': 'Bearer ' + wikipedia_token,
}
|
|
|
|
def read_last_revision(path, default):
    """Return the "id" of the last JSON record stored in the JSONL file *path*.

    Only the tail of the file is read: the file is scanned backwards from its
    end to locate the final non-blank line, which is then parsed as JSON.
    Trailing newlines/whitespace are skipped so a blank line at the end of the
    file does not hide the last record.

    *default* is returned when the file is missing, unreadable or empty, when
    the last line is not valid JSON, or when the record has no "id" key.
    """
    try:
        with open(path, 'rb') as f:
            f.seek(0, os.SEEK_END)
            end = f.tell()

            # Step back over trailing whitespace (including a final newline).
            while end > 0:
                f.seek(end - 1)
                if not f.read(1).isspace():
                    break
                end -= 1

            # Step back to just after the previous newline, or to the start of
            # the file when there is only one line.
            start = end
            while start > 0:
                f.seek(start - 1)
                if f.read(1) == b'\n':
                    break
                start -= 1

            f.seek(start)
            last_line = f.read(end - start).decode()

        ll = json.loads(last_line)
        print(ll)
        return ll["id"]
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Best effort: any problem with the stored data means "start from the
        # default", but say why so a corrupt file does not go unnoticed.
        print("could not resume from", path, "-", repr(e))
        return default


# 1289435017 is the last pre-pope edit
# 1289422020 is a couple edits prior
last_revision = read_last_revision('pope.jsonl', "1289422020")
|
|
|
|
# Title of the article whose revision history is requested.
page = 'Pope_Leo_XIV'
# History endpoint for that article, asking only for revisions newer than
# last_revision. The title comes from `page` so the two cannot drift apart.
url = f'https://api.wikimedia.org/core/v1/wikipedia/en/page/{page}/history?newer_than={last_revision}'
|
|
|
|
# Running count of revisions fetched during this run.
n = 0

# Revisions fetched during this run. Each page is reversed before it is
# appended (presumably the API lists a page newest-first, so this keeps the
# list in chronological order - confirm against the API documentation).
new = []

while True:
    print(url)

    try:
        # The timeout keeps a stalled connection from hanging the script.
        response = requests.get(url, headers=headers, timeout=30)
        # Without this check an error payload (which has no "newer" link)
        # would be indistinguishable from having reached the newest revision.
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Stop paging but leave `new` intact so the pages fetched so far are
        # still available to whatever follows the loop.
        print("request failed, stopping early:", exc)
        break

    revisions = data.get("revisions", [])
    new.extend(reversed(revisions))
    n += len(revisions)

    print("rl remains", response.headers.get("x-ratelimit-remaining"), "rl resets", response.headers.get("x-ratelimit-reset"))

    # No "newer" link in the response: nothing further to request.
    if "newer" not in data:
        break

    url = data["newer"]

    # Guard against a page that carries a "newer" link but no revisions.
    if revisions:
        print("current", revisions[0]["timestamp"])
    print(f"{n=}")

    # Pause briefly between requests.
    time.sleep(0.5)
|
|
|
|
# Print the final value of the revision counter.
print("FINAL", n)

# Append every collected revision to the JSONL file, one JSON object per line.
with open("pope.jsonl", "a") as out:
    out.writelines(json.dumps(rev) + "\n" for rev in new)
|