Skip to content

Commit 7a7303a

Browse files
committed
Also do pypi cleanup
1 parent 4fbf931 commit 7a7303a

File tree

2 files changed

+304
-0
lines changed

2 files changed

+304
-0
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ scripts = [ # dependencies used for running scripts
260260
"pcpp",
261261
"polars",
262262
"pyarrow",
263+
"pyotp>=2.9.0",
263264
"pytz"
264265
]
265266
build = [

scripts/pypi_cleanup.py

Lines changed: 303 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,303 @@
1+
import argparse
2+
import pyotp
3+
import datetime
4+
import logging
5+
import os
6+
import re
7+
import sys
8+
import time
9+
from collections import defaultdict
10+
from html.parser import HTMLParser
11+
from textwrap import dedent
12+
from urllib.parse import urlparse
13+
14+
import requests
15+
from requests.exceptions import RequestException
16+
17+
import argparse
18+
import re
19+
import os
20+
21+
def valid_hostname(hostname):
    """Validate a hostname given on the command line (argparse ``type=`` callable).

    A hostname is accepted when it is at most 253 characters long and consists
    of dot-separated labels of 1-63 characters, each made of ASCII letters,
    digits and hyphens and neither starting nor ending with a hyphen.

    Args:
        hostname: The raw string supplied by the user.

    Returns:
        The hostname, unchanged.

    Raises:
        argparse.ArgumentTypeError: If the hostname is too long or malformed.
    """
    if len(hostname) > 253:
        raise argparse.ArgumentTypeError("Hostname too long (max 253 characters)")

    # Check for valid hostname pattern
    hostname_pattern = r'^[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*$'
    # fullmatch, not match: with re.match the trailing `$` also matches just
    # before a final newline, so "pypi.org\n" would be accepted and end up
    # embedded in the index URL.
    if not re.fullmatch(hostname_pattern, hostname):
        raise argparse.ArgumentTypeError(f"Invalid hostname format: {hostname}")

    return hostname
32+
33+
def non_empty_string(value):
    """Return ``value`` with surrounding whitespace removed (argparse ``type=`` callable).

    Raises:
        argparse.ArgumentTypeError: If ``value`` is empty/falsy or contains
            only whitespace.
    """
    # Strip once and reuse the result for both the check and the return value.
    trimmed = value.strip() if value else ""
    if trimmed == "":
        raise argparse.ArgumentTypeError("Value cannot be empty")
    return trimmed
38+
39+
# Command-line interface. Secrets are deliberately NOT accepted as arguments;
# they are read from environment variables below.
parser = argparse.ArgumentParser(
    description="PyPI cleanup script",
    epilog="Environment variables required (unless --dry): PYPI_CLEANUP_PASSWORD, PYPI_CLEANUP_OTP"
)
parser.add_argument("--dry", action="store_true", help="Show what would be deleted but don't actually do it")
parser.add_argument("--index-hostname", type=valid_hostname, required=True, help="Index hostname (required)")
parser.add_argument("--max-nightlies", type=int, default=2, help="Max number of nightlies of unreleased versions (default=2)")
parser.add_argument("--username", type=non_empty_string, help="Username (required unless --dry)")
args = parser.parse_args()

# The value is later used as a negative slice bound ([:-n]); a negative n
# would flip that into [:n] and select the wrong versions for deletion.
if args.max_nightlies < 0:
    parser.error("--max-nightlies must be a non-negative integer")

# Handle secrets from environment variables
password = None
otp = None

if not args.dry:
    if not args.username:
        parser.error("--username is required when not in dry-run mode")

    password = os.getenv('PYPI_CLEANUP_PASSWORD')
    otp = os.getenv('PYPI_CLEANUP_OTP')

    if not password:
        parser.error("PYPI_CLEANUP_PASSWORD environment variable is required when not in dry-run mode")
    if not otp:
        parser.error("PYPI_CLEANUP_OTP environment variable is required when not in dry-run mode")

print(f"Dry run: {args.dry}")
print(f"Max nightlies: {args.max_nightlies}")
if not args.dry:
    print(f"Hostname: {args.index_hostname}")
    print(f"Username: {args.username}")
    print("Password and OTP loaded from environment variables")

# deletes old dev wheels from pypi. evil hack.
actually_delete = not args.dry
# In dry-run mode no credentials are supplied, so placeholders are used.
pypi_username = args.username or "user"
max_dev_releases = args.max_nightlies
host = 'https://{}/'.format(args.index_hostname)
pypi_password = password or "password"
pypi_otp = otp or "otp"

# Version strings ending in ".dev<digits>".
patterns = [re.compile(r".*\.dev\d+$")]
81+
###### NOTE: This code is taken from the pypi-cleanup package (https://github.com/arcivanov/pypi-cleanup/tree/master)
82+
class CsfrParser(HTMLParser):
    """Extract the ``csrf_token`` value from a specific form on an HTML page.

    Only forms whose ``action`` attribute starts with ``target`` are
    inspected. When ``contains_input`` is given, a form's token is accepted
    only if that same form also contains an ``<input>`` with that name.
    After ``feed()``, the first accepted token is available as ``self.csrf``
    (``None`` if no form qualified).

    Args:
        target: Prefix the form's ``action`` attribute must start with.
        contains_input: Optional name of an input the form must contain.
    """

    def __init__(self, target, contains_input=None):
        super().__init__()
        self._target = target
        self._contains_input = contains_input
        self.csrf = None  # Result value from all forms on page
        self._csrf = None  # Temp value from current form
        self._in_form = False  # Currently parsing a form with an action we're interested in
        self._input_contained = False  # Input field requested is contained in the current form

    def handle_starttag(self, tag, attrs):
        """Track entry into a matching form and record its relevant inputs."""
        if tag == "form":
            action = dict(attrs).get("action")  # Might be None.
            if action and action.startswith(self._target):
                self._in_form = True
                # Start from a clean slate for every matching form so that a
                # token or "input seen" flag from an earlier form cannot be
                # combined with this one.
                self._csrf = None
                self._input_contained = False
            return

        if self._in_form and tag == "input":
            attrs = dict(attrs)
            name = attrs.get("name")
            if name == "csrf_token":
                # .get(): a token input without a value attribute is treated
                # as "no token" instead of raising KeyError.
                self._csrf = attrs.get("value")

            if self._contains_input and name == self._contains_input:
                self._input_contained = True

    def handle_endtag(self, tag):
        """On ``</form>``, publish the form's token if the form qualifies."""
        if tag == "form":
            self._in_form = False
            # If we're in a right form that contains the requested input and csrf is not set
            if (not self._contains_input or self._input_contained) and not self.csrf:
                self.csrf = self._csrf
117+
118+
119+
class PypiCleanup:
    """Delete superseded ``.dev`` releases of a package from a PyPI-style index.

    The release list is read from ``<url>/pypi/<package>/json``. Dev releases
    are grouped by the version prefix before ``.dev``; every dev release of an
    already released version is selected for deletion, and for unreleased
    versions all but the ``max_dev_releases`` highest-numbered dev releases
    are selected. Deletion is done by logging in through the index's web
    forms (including TOTP two-factor) and submitting each release's delete
    form.

    Args:
        url: Base URL of the index; one trailing slash is removed.
        username: Account name used for the web login.
        package: Project name whose releases are cleaned up.
        password: Account password.
        otp: TOTP secret handed to ``pyotp.TOTP`` to generate login codes.
        patterns: Version patterns; only reported in log/warning messages.
        delete: When false, only print what would be deleted (dry run).
        max_dev_releases: Number of dev releases to keep per unreleased version.
        verbose: When true, set the root logger to DEBUG.

    Raises:
        ValueError: If ``max_dev_releases`` is negative.
    """

    def __init__(self, url, username, package, password, otp, patterns, delete, max_dev_releases, verbose=False):
        # Used as a negative slice bound ([:-n]); a negative n would flip that
        # into [:n] and select the wrong versions.
        if max_dev_releases < 0:
            raise ValueError("max_dev_releases must be non-negative")
        self.url = urlparse(url).geturl()
        # endswith() instead of [-1] so an empty URL does not raise IndexError.
        if self.url.endswith("/"):
            self.url = self.url[:-1]
        self.username = username
        self.password = password
        self.otp = otp
        self.do_it = delete
        self.package = package
        self.patterns = patterns
        self.max_dev_releases = max_dev_releases
        self.verbose = verbose

    def run(self):
        """Run the cleanup.

        Returns:
            ``None`` when the run completed (including dry runs and runs with
            nothing to delete), ``1`` when the package lookup or the login
            failed, ``3`` when the selection would remove every release and
            the run was therefore refused.

        Raises:
            Exception: If a non-dev version was selected, credentials are
                missing, or two-factor authentication failed.
            ValueError: If a required CSRF token could not be found.
        """
        if self.verbose:
            logging.root.setLevel(logging.DEBUG)

        if self.do_it:
            logging.warning("!!! WILL ACTUALLY DELETE THINGS !!!")
            logging.warning("Will sleep for 3 seconds - Ctrl-C to abort!")
            time.sleep(3.0)
        else:
            logging.info("Running in DRY RUN mode")

        logging.info(f"Will use the following patterns {self.patterns} on package {self.package}")

        with requests.Session() as session:
            with session.get(f"{self.url}/pypi/{self.package}/json") as response:
                try:
                    response.raise_for_status()
                except RequestException as e:
                    logging.error(f"Unable to find package {repr(self.package)}", exc_info=e)
                    return 1
                releases_by_date = self._latest_upload_by_release(response.json()["releases"])

            if not releases_by_date:
                logging.info(f"No releases for package {self.package} have been found")
                return

            pkg_vers = self._select_versions_to_delete(releases_by_date)

            if not self.do_it:
                print("Following pkg_vers can be deleted: ", pkg_vers)
                return

            if not pkg_vers:
                logging.info(f"No releases were found matching specified patterns and dates in package {self.package}")
                return

            if set(pkg_vers) == set(releases_by_date.keys()):
                print(
                    dedent(
                        f"""
                        WARNING:
                        \tYou have selected the following patterns: {self.patterns}
                        \tThese patterns would delete all available released versions of `{self.package}`.
                        \tThis will render your project/package permanently inaccessible.
                        \tSince the costs of an error are too high I'm refusing to do this.
                        \tGoodbye.
                        """
                    ),
                    file=sys.stderr,
                )
                # Always stop here. This point is only reached when deletion
                # is enabled, so making the return conditional on a dry run
                # would print the refusal and then delete everything anyway.
                return 3

            for pkg in pkg_vers:
                if 'dev' not in pkg:
                    raise Exception(f"Would be deleting version {pkg} but the version is not a dev version")

            if self.username is None:
                raise Exception("No username provided")

            if self.password is None:
                raise Exception("No password provided")

            if not self._authenticate(session):
                return 1

            for pkg_ver in pkg_vers:
                # Re-checked right before each destructive request.
                if 'dev' not in pkg_ver:
                    raise Exception(f"Would be deleting version {pkg_ver} but the version is not a dev version")
                if self.do_it:
                    self._delete_release(session, pkg_ver)
                else:
                    # Safety net only: dry runs return before login above.
                    logging.info(f"Would be deleting {self.package} version {pkg_ver}, but not doing it!")

    @staticmethod
    def _latest_upload_by_release(releases):
        """Map each release name to the newest ``upload_time`` among its files."""
        latest = {}
        for release, files in releases.items():
            # A release may have an empty file list; max() without a default
            # would raise ValueError on it.
            latest[release] = max(
                (datetime.datetime.strptime(f["upload_time"], '%Y-%m-%dT%H:%M:%S') for f in files),
                default=datetime.datetime.min,
            )
        return latest

    def _select_versions_to_delete(self, release_names):
        """Return the dev releases among ``release_names`` that should be deleted."""
        names = list(release_names)
        known = set(names)

        # Group dev releases by the version they lead up to ("1.2.0.dev7" -> "1.2.0").
        version_dict = defaultdict(list)
        for key in names:
            if '.dev' in key:
                prefix, _, _ = key.partition('.dev')
                version_dict[prefix].append(key)

        pkg_vers = []
        for version_key, versions in version_dict.items():
            # When the version key itself is a known release, the version has
            # been released and its dev builds are no longer needed: delete
            # them all. Otherwise keep the max_dev_releases most recent builds
            # and delete the rest.
            if version_key in known or self.max_dev_releases == 0:
                pkg_vers.extend(versions)
            else:
                # sort by the suffix casted to int to keep only the most recent builds
                sorted_versions = sorted(versions, key=lambda x: int(x.split('dev')[-1]))
                pkg_vers.extend(sorted_versions[:-self.max_dev_releases])
        return pkg_vers

    @staticmethod
    def _extract_csrf(html, form_action, contains_input=None):
        """Return the CSRF token of the form posting to ``form_action``; raise if absent."""
        csrf_parser = CsfrParser(form_action, contains_input)
        csrf_parser.feed(html)
        if not csrf_parser.csrf:
            raise ValueError(f"No CSFR found in {form_action}")
        return csrf_parser.csrf

    def _authenticate(self, session):
        """Log ``session`` in via the web form; return False if the login is rejected."""
        login_url = f"{self.url}/account/login/"
        with session.get(login_url) as r:
            r.raise_for_status()
            csrf = self._extract_csrf(r.text, "/account/login/")

        two_factor_url = None
        with session.post(
            login_url,
            data={"csrf_token": csrf, "username": self.username, "password": self.password},
            headers={"referer": login_url},
        ) as r:
            r.raise_for_status()
            # Ending up on the login page again means the credentials were rejected.
            if r.url == login_url:
                logging.error(f"Login for user {self.username} failed")
                return False

            if r.url.startswith(f"{self.url}/account/two-factor/"):
                csrf = self._extract_csrf(r.text, r.url[len(self.url):])
                two_factor_url = r.url

        if two_factor_url is not None:
            self._submit_totp(session, two_factor_url, csrf)
        return True

    def _submit_totp(self, session, two_factor_url, csrf):
        """Submit a TOTP code to the two-factor form, trying up to three times."""
        for _ in range(3):
            auth_code = pyotp.TOTP(self.otp).now()
            with session.post(
                two_factor_url,
                data={"csrf_token": csrf, "method": "totp", "totp_value": auth_code},
                headers={"referer": two_factor_url},
            ) as r:
                r.raise_for_status()
                # Leaving the two-factor page means the code was accepted.
                if r.url != two_factor_url:
                    return
                logging.error(f"Authentication code {auth_code} is invalid, retrying in 5 seconds...")
                time.sleep(5)
        raise Exception("Could not authenticate with OTP")

    def _delete_release(self, session, pkg_ver):
        """Fetch the release management page and submit its delete-confirmation form."""
        logging.info(f"Deleting {self.package} version {pkg_ver}")
        form_action = f"/manage/project/{self.package}/release/{pkg_ver}/"
        form_url = f"{self.url}{form_action}"
        with session.get(form_url) as r:
            r.raise_for_status()
            csrf = self._extract_csrf(r.text, form_action, "confirm_delete_version")
            referer = r.url

        with session.post(
            form_url,
            data={
                "csrf_token": csrf,
                "confirm_delete_version": pkg_ver,
            },
            headers={"referer": referer},
        ) as r:
            r.raise_for_status()

        logging.info(f"Deleted {self.package} version {pkg_ver}")
301+
302+
303+
# run() returns a non-zero status on failure (e.g. 1 when the package lookup
# or the login fails) and None otherwise; pass it on as the process exit code
# so a failed cleanup does not look like a success to the caller.
sys.exit(
    PypiCleanup(host, pypi_username, 'duckdb', pypi_password, pypi_otp, patterns, actually_delete, max_dev_releases).run()
)

0 commit comments

Comments
 (0)